In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
import lib
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# load data
data = lib.Dataset("YAHOO", random_state=1337, quantile_transform=True, quantile_noise=1e-3)
mu, std = data.y_train.mean(), data.y_train.std()
normalize = lambda x: ((x - mu) / std).astype(np.float32)
data.y_train, data.y_valid, data.y_test = map(normalize, [data.y_train, data.y_valid, data.y_test])

In [3]:
regressor = DecisionTreeRegressor(random_state=0)
# Initialize lists to save the losses
train_losses = []
valid_losses = []
test_losses = []

# Train the model by constructing the decision tree and calculate the MSE on the training set
regressor.fit(data.X_train, data.y_train)
mse_train = mean_squared_error(data.y_train, regressor.predict(data.X_train))
print("Train MSE before optimization: ", round(mse_train, 4))
train_losses.append(mse_train)

# Calculate the MSE on the validation set
mse_valid = mean_squared_error(data.y_valid, regressor.predict(data.X_valid))
print("Validation MSE before optimization: ", round(mse_valid, 4))
valid_losses.append(mse_valid)

# Calculate the MSE on the test set
mse_test = mean_squared_error(data.y_test, regressor.predict(data.X_test))
print("Test MSE before optimization: ", round(mse_test, 4))
test_losses.append(mse_test)

# Optimize the model parameters using grid search
param_grid = {'max_depth': [None, 5, 10, 15, 20]}
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=0), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(np.concatenate([data.X_train, data.X_valid]), np.concatenate([data.y_train, data.y_valid]))
best_regressor = grid_search.best_estimator_

# Calculate the MSE on the training, validation and test sets after optimization
mse_train_optimized = mean_squared_error(data.y_train, best_regressor.predict(data.X_train))
print("Train MSE after optimization: ", round(mse_train_optimized, 4))
train_losses.append(mse_train_optimized)

mse_valid_optimized = mean_squared_error(data.y_valid, best_regressor.predict(data.X_valid))
print("Validation MSE after optimization: ", round(mse_valid_optimized, 4))
valid_losses.append(mse_valid_optimized)

mse_test_optimized = mean_squared_error(data.y_test, best_regressor.predict(data.X_test))
print("Test MSE after optimization: ", round(mse_test_optimized, 4))
test_losses.append(mse_test_optimized)

Train MSE before optimization:  0.0161
Validation MSE before optimization:  1.2092
Test MSE before optimization:  1.1685
Train MSE after optimization:  0.6483
Validation MSE after optimization:  0.6647
Test MSE after optimization:  0.6764
