# In [None]:
import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

"""
There is a standard housing dataset that people tend to use to get started with machine learning. 
You can download it at https://archive.ics.uci.edu/ml/machine-learning-databases/housing/. 
We will be using a slightly modified version of the dataset, which has been provided along with the code files. 
The good thing is that scikit-learn provides a function to directly load this dataset:
"""
# The Boston loader was deprecated in scikit-learn 1.0 and removed in 1.2,
# where looking it up raises ImportError. If it is no longer available, fall
# back to the California housing data (downloaded and cached on first use) so
# the rest of the script still runs.
try:
    housing_data = datasets.load_boston()
except (AttributeError, ImportError):
    housing_data = datasets.fetch_california_housing()

# The plotting code further down indexes feature_names with an integer array,
# which only works on an ndarray; some loaders return a plain list instead.
housing_data.feature_names = np.asarray(housing_data.feature_names)

"""
The sklearn.utils.shuffle() function shuffles arrays or 
sparse matrices in a consistent way to do random permutations of collections. 
Shuffling data reduces variance and makes sure that the patterns remain general and less overfitted.
The random_state parameter controls how we shuffle data so that we can have reproducible results. 
"""
# Permute features and targets together; the fixed seed keeps the order repeatable.
X, y = shuffle(housing_data.data, housing_data.target, random_state=7)

# Split point: the first 80% of the shuffled rows train the models,
# the remaining 20% are held out for testing.
num_training = int(0.8 * len(X))
X_train, X_test = X[:num_training], X[num_training:]
y_train, y_test = y[:num_training], y[num_training:]

"""
We are now ready to fit a decision tree regression model.  
Let's pick a tree with a maximum depth of 4, 
which means that we are not letting the tree become arbitrarily deep:
"""
# Single depth-limited tree, used as the baseline model.
# fit() returns the estimator itself, so construction and training can be chained.
dt_regressor = DecisionTreeRegressor(max_depth=4).fit(X_train, y_train)

"""
Fit the decision tree regression model with AdaBoost:
The AdaBoostRegressor function has been used to compare the results and 
see how AdaBoost really boosts the performance of a decision tree regressor.
"""
# Weak learner that AdaBoost will replicate: the same depth-4 tree as the baseline.
boosted_tree = DecisionTreeRegressor(max_depth=4)

# NOTE: the base estimator is passed positionally on purpose. Its keyword name
# differs between scikit-learn releases (`base_estimator` vs. `estimator`).
ab_regressor = AdaBoostRegressor(boosted_tree, n_estimators=400, random_state=7)
ab_regressor.fit(X_train, y_train)

"""
Let's evaluate the performance of the decision tree regressor:
1. Use the predict() function to predict the response variable based on the test data.
2. Calculate the mean squared error and the explained variance.
3. The mean squared error is the average of the squared differences
between actual and predicted values across all data points in the input.
4. The explained variance is the proportion of the variability in our data
that is explained by the model in question.
"""
def report_performance(label, y_true, y_pred):
    """Print and return the regression metrics for one model.

    Args:
        label: Model name inserted into the printed header line.
        y_true: Actual target values.
        y_pred: Values predicted by the model for the same samples.

    Returns:
        Tuple ``(mse, evs)``: the mean squared error and the explained
        variance score, unrounded (only the printed values are rounded
        to two decimals).
    """
    mse = mean_squared_error(y_true, y_pred)
    evs = explained_variance_score(y_true, y_pred)
    print(f"#### {label} performance ####")
    print("Mean squared error =", round(mse, 2))
    print("Explained variance score =", round(evs, 2))
    return mse, evs


# Evaluate the single decision tree on the held-out data.
y_pred_dt = dt_regressor.predict(X_test)
report_performance("Decision Tree", y_test, y_pred_dt)

# Now, let's evaluate the performance of AdaBoost.
# mse / evs are kept at module level and end up holding the AdaBoost scores.
y_pred_ab = ab_regressor.predict(X_test)
mse, evs = report_performance("AdaBoost", y_test, y_pred_ab)

"""
DecisionTreeRegressor builds a decision tree regressor. 
Decision trees are used to predict a response or class y, from several input variables; x1, x2,…,xn.
If y is a continuous response, it's called a regression tree,
if y is categorical, it's called a classification tree.

An AdaBoost regressor is a meta-estimator that starts by fitting a regressor 
on the actual dataset and then fits additional copies of the regressor on the same dataset, 
but where the weights of instances are adjusted according to the error of the current prediction. 
"""

def plot_feature_importances(feature_importances, title, feature_names):
    """Draw a bar chart of relative feature importances, most important first.

    The importances are rescaled so that the largest one equals 100, sorted
    in descending order, and plotted in a new figure that is shown
    immediately.

    Args:
        feature_importances: One importance value per feature.
        title: Title of the figure.
        feature_names: One label per feature, in the same order as
            ``feature_importances``. May be a list or an array.

    Returns:
        The rescaled importances (0-100), in the original feature order.
    """
    importances = np.asarray(feature_importances, dtype=float)

    # Guard the normalisation: if every importance is zero, dividing by the
    # maximum would yield NaN for all bars.
    peak = importances.max()
    if peak > 0:
        relative = 100.0 * (importances / peak)
    else:
        relative = np.zeros_like(importances)

    # argsort is ascending; flip it to get the most important feature first.
    order = np.flipud(np.argsort(relative))
    # +0.5 places each bar at the centre of its unit-wide slot.
    positions = np.arange(order.shape[0]) + 0.5

    # Indexing with an integer array requires an ndarray, so plain lists of
    # names are converted first.
    labels = np.asarray(feature_names)[order]

    plt.figure()
    plt.bar(positions, relative[order], align='center')
    plt.xticks(positions, labels)
    plt.ylabel('Relative Importance')
    plt.title(title)
    plt.show()
    return relative


DTFImp = plot_feature_importances(
    dt_regressor.feature_importances_,
    "Decision Tree regressor",
    housing_data.feature_names,
)

ABFImp = plot_feature_importances(
    ab_regressor.feature_importances_,
    "AdaBoost regressor",
    housing_data.feature_names,
)