In [None]:
import mglearn
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import scipy as scipy
import seaborn as sns

import warnings
# Ignore every FutureWarning raised from here on (presumably to keep deprecation
# notices from the imported libraries out of the cell outputs).
warnings.simplefilter(action='ignore', category=FutureWarning)

# Exercise

We shall once again play around with the California housing data set

## The data

In [None]:
from sklearn.datasets import fetch_california_housing

# Load the California housing data. as_frame=True is requested so that the
# features (data['data']) and the target (data['target']) can be used as
# pandas objects in the cells below.
data = fetch_california_housing(
    as_frame=True,
)

And again we treat it as a classification problem:

In [None]:
# Turn the regression target into a binary class label: 1 when a value lies
# strictly above the median of all targets, 0 otherwise. The median is computed
# once up front instead of once per sample inside a Python loop.
median_target = np.median(data['target'])
c = np.where(data['target'] > median_target, 1, 0)

# Hold out part of the data for testing; random_state=0 makes the split repeatable.
X_train, X_test, c_train, c_test = train_test_split(data['data'], c, random_state=0)

print("Size of training set: {}".format(X_train.shape[0]))
print("Size of test set: {}".format(X_test.shape[0]))

## Model evaluation

We learn a decision tree using the standard training/test division

In [None]:
# Fit a decision tree with default settings on the training split and report its
# accuracy on the held-out test split. random_state is fixed because scikit-learn
# permutes the features at every split, so equally good splits could otherwise be
# resolved differently from run to run and the accuracy would not be reproducible.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, c_train)
print("Accuracy on test set: {}".format(tree.score(X_test, c_test)))

### *Exercise:*

Evaluate the accuracy of the model using cross validation (below is a code snippet that you can use as a basis):
* Compare the results with the one you obtained above, both in relation to the
    - scores for the individual folds
    - mean score
    - variation in the scores
* Experiment with different numbers of folds (possibly also leave-one-out cross validation). How does this affect your estimates?

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Score a fresh decision tree with cross validation on the complete data set
# (features data.data, binary labels c). `cv` sets the number of folds; change it
# to see how the estimate reacts. random_state is fixed so that differences
# between the fold scores come from the folds and not from random tie-breaking
# inside the tree learner.
tree = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(tree, data.data, c, cv=3)
print("Cross validation scores: {}".format(scores))

In [None]:
# Mean accuracy over all cross-validation folds.
print(f"Average cross validation score: {scores.mean()}")

In [None]:
# Spread of the per-fold accuracies around their mean.
print(f"Standard deviation of the cross validation scores: {scores.std()}")

## Parameter selection

In this part of the exercise we will experiment with cross validation for the selection of parameters

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Fix the parameter space
parameters = {'min_samples_split': range(2,20)}
grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=5, return_train_score=True)
grid_search.fit(X_train, c_train)

In [None]:
# Score of the fitted grid search on the held-out test split, shown with two decimals.
print(f"Test score: {grid_search.score(X_test, c_test):.2f}")

In [None]:
# Parameter setting selected by the grid search.
print(f"Best parameter: {grid_search.best_params_}")

In [None]:
# Cross-validation score reported by the grid search for the selected setting.
print(f"Best cross-validation score: {grid_search.best_score_}")

In [None]:
# Tabulate the grid-search results: each key of cv_results_ becomes a column.
# Left as the cell's last expression so the notebook renders the table.
pd.DataFrame.from_dict(grid_search.cv_results_)

### *Exercise:* 

In the code examples above, we used cross validation to optimize the values for `min_samples_split`. You can reuse this code in the exercises below.
* Use cross-validation to optimize at least one other parameter (controlling complexity) of the decision tree. If you are adventurous, try to perform joint optimization over several parameters.
* What are the optimal values found?
* How do the training/test scores compare to when no optimization is performed?
* Try to visualize the tree and compare with the full tree you displayed in the decision tree exercise.