# Problem 3: Regression
For this problem, you need to use the built-in sklearn California Housing dataset. You can load this data using
-- from sklearn.datasets import fetch_california_housing
-- cal_housing = fetch_california_housing()

Divide the data into training and test sets using train_test_split and random_state=38

The goal is to experiment with a few regression algorithms and compare their performance on this data.
1. Build and train a LASSO Regression model. Vary the constraint parameter α and analyze the results by identifying cases of overfitting and underfitting. Select the optimal value of α and justify your choice.
2. Build and train a Decision Tree regression model. Vary the pruning parameter and analyze the results by identifying cases of overfitting and underfitting. Select the optimal pruning and justify your choice.
3. Compare the accuracy of the 2 methods and the relevant features identified by each method and comment on the results.

import libraries

In [None]:
import operator

import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas
import pandas as pd
import sklearn_evaluation
import sns as sns
from numpy import random
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor

# `random` is numpy.random here (see `from numpy import random` above), so one
# call is enough to seed NumPy's global RNG with the assignment's seed.
np.random.seed(38)

import data

In [None]:
from sklearn.datasets import fetch_california_housing

# Load the California Housing data and hold out a test split (seed 38, as the
# assignment requires); the split sizes are printed one per line.
cal_housing = fetch_california_housing()

X_train, X_test, y_train, y_test = train_test_split(
    cal_housing.data, cal_housing.target, random_state=38
)

print(len(X_train), len(X_test), sep='\n')

## LASSO

In [None]:

def grid_search_lasso(alphas):
    """Run a 4-fold grid search over Lasso ``alpha`` values on the training split.

    Args:
        alphas: Candidate values for the Lasso ``alpha`` parameter.

    Returns:
        The fitted ``GridSearchCV`` object (train scores are recorded as well).
    """
    param_grid = {'alpha': alphas}
    search = GridSearchCV(
        estimator=Lasso(random_state=38),
        param_grid=param_grid,
        cv=4,
        return_train_score=True,
    ).fit(X_train, y_train)

    # Report the winning alpha together with train/test scores.
    print_grid_search_results(search, param_grid)
    return search


def print_grid_search_results(grid_search, parameters):
    """Print the best value of each searched parameter and the train/test scores.

    Args:
        grid_search: Fitted search object exposing ``best_params_`` and ``score``.
        parameters: Iterable of parameter names to report.
    """
    for name in parameters:
        print(f'Best {name}:', grid_search.best_params_[name])

    # Scores are computed on the notebook-level train and test splits.
    for label, features, targets in (
        ("Train Score: ", X_train, y_train),
        ("Test Score: ", X_test, y_test),
    ):
        print(label, grid_search.score(features, targets))


def graph_grid_results_lasso(grid_results, scale):
    """Print and plot mean train/test CV scores against the searched alphas.

    Args:
        grid_results: Fitted search object whose ``cv_results_`` holds
            ``params``, ``mean_train_score`` and ``mean_test_score``.
        scale: ``'log'`` for a logarithmic x-axis; any other value plots on a
            linear axis.
    """
    cv_results = grid_results.cv_results_
    alpha_values = pd.DataFrame(cv_results['params']).to_numpy().ravel()
    scores = pd.DataFrame({
        'Train': cv_results['mean_train_score'],
        'Test': cv_results['mean_test_score'],
    })

    # The score table is printed before it is re-indexed by alpha.
    print(alpha_values)
    print(scores)

    scores.index = alpha_values
    scores.plot(logx=(scale == 'log'))

# test_scores_frame = alphas_frame.append(
#     pd.DataFrame(grid_results.cv_results_['mean_test_score'])
# )
# train_scores_frame.plot
# ax.set_xscale(scale)

Trial 1

In [None]:
# Coarse search: 20 log-spaced alphas between 10**0 and 10**4.
alphas = np.logspace(start=0, stop=4, num=20)
grid_search_lasso_t1 = grid_search_lasso(alphas=alphas)

In [None]:
# Trial 1 scores on a logarithmic alpha axis.
graph_grid_results_lasso(grid_results=grid_search_lasso_t1, scale='log')

Trial 2

In [None]:
# Shift the search to smaller penalties: 20 log-spaced alphas in [10**-4, 10**0].
alphas = np.logspace(start=-4, stop=0, num=20)
grid_search_lasso_t2 = grid_search_lasso(alphas=alphas)

In [None]:
# Trial 2 scores on a logarithmic alpha axis.
graph_grid_results_lasso(grid_results=grid_search_lasso_t2, scale='log')

In [None]:
# Fine search: 200 evenly spaced alphas between 0.0001 and 0.002.
alphas = np.linspace(start=1e-4, stop=2e-3, num=200)
grid_search_lasso_t3 = grid_search_lasso(alphas=alphas)

In [None]:
# The fine grid is evenly spaced, so plot it on a linear axis.
graph_grid_results_lasso(grid_results=grid_search_lasso_t3, scale='linear')

optimal lasso

In [None]:
# NOTE(review): alpha=1 is the upper edge of the Trial 2 grid and lies outside
# the 0.0001-0.002 range refined by the third search -- confirm this is really
# the intended "optimal" value before relying on the coefficients below.
optimal_lasso = Lasso(alpha=1).fit(X_train, y_train)

for label, features, targets in (
    ("Train Score: ", X_train, y_train),
    ("Test Score: ", X_test, y_test),
):
    print(label, optimal_lasso.score(features, targets))

## Decision Tree regression

In [None]:
# Baseline: an unconstrained tree, used as the reference for the pruning trials.
default_dt = DecisionTreeRegressor(random_state=38).fit(X_train, y_train)

train_score = default_dt.score(X_train, y_train)
test_score = default_dt.score(X_test, y_test)
print("Train Score: ", train_score)
print("Test Score: ", test_score)
print(default_dt.get_depth())

general functions

In [None]:
def grid_search_decision_tree(
        max_depths,
        min_samples_splits,
        min_samples_leafs=None
):
    """Run a 4-fold grid search over decision-tree pruning parameters.

    Args:
        max_depths: Candidate ``max_depth`` values.
        min_samples_splits: Candidate ``min_samples_split`` values.
        min_samples_leafs: Optional candidate ``min_samples_leaf`` values; the
            parameter is left out of the grid when this is ``None``.

    Returns:
        The ``GridSearchCV`` object fitted on the training split.
    """
    param_grid = {
        'max_depth': max_depths,
        'min_samples_split': min_samples_splits,
    }
    include_additional_parameters(min_samples_leafs, param_grid)

    search = GridSearchCV(
        estimator=DecisionTreeRegressor(random_state=38),
        param_grid=param_grid,
        cv=4,
    ).fit(X_train, y_train)

    print_grid_search_results(search, param_grid)
    return search


def include_additional_parameters(min_sample_leafs, parameters):
    """Add a ``min_samples_leaf`` entry to ``parameters`` in place, if given.

    Args:
        min_sample_leafs: Candidate values, or ``None`` to leave the grid as is.
        parameters: Parameter-grid dict; mutated in place.
    """
    if min_sample_leafs is None:
        return
    parameters['min_samples_leaf'] = min_sample_leafs


def print_grid_search_results(grid_search, parameters):
    """Print the best value of each searched parameter and the train/test scores.

    Same behavior as the definition in the LASSO section; redefined in this cell.

    Args:
        grid_search: Fitted search object exposing ``best_params_`` and ``score``.
        parameters: Iterable of parameter names to report.
    """
    for name in parameters:
        print(f'Best {name}:', grid_search.best_params_[name])

    # Scores are computed on the notebook-level train and test splits.
    for label, features, targets in (
        ("Train Score: ", X_train, y_train),
        ("Test Score: ", X_test, y_test),
    ):
        print(label, grid_search.score(features, targets))


def graph_grid_search_decision_tree(grid_search):
    """Plot grid-search results over ``max_depth`` and ``min_samples_split``.

    Args:
        grid_search: Fitted search object whose ``cv_results_`` is plotted.
    """
    varied = ('max_depth', 'min_samples_split')

    plt.figure(figsize=(16, 16), dpi=80)
    # A `subset={...}` argument can be passed here to restrict the plotted values.
    sklearn_evaluation.evaluator.plot.grid_search(
        grid_search.cv_results_,
        change=varied,
    )

    x_label, y_label = varied
    plt.title("Decision Tree")
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

Trial 1 max_depth and min_samples_splits

In [None]:
import time

# Truncating a log-spaced grid to integers produces repeated values (1, 1, 2,
# 2, ...); np.unique keeps each candidate once so no fit is run twice.
max_depths = np.unique(np.logspace(0, 3, 20, dtype=int))
# scikit-learn requires an integer min_samples_split to be at least 2, so the
# value 1 is dropped from this grid instead of producing failed fits.
min_samples_splits = max_depths[max_depths >= 2]

# Plain-Python timing replaces the `% time` magic, which is not valid Python
# syntax inside an assignment.
_trial_1_start = time.perf_counter()
grid_results_t1 = grid_search_decision_tree(max_depths, min_samples_splits)
print(f"Wall time: {time.perf_counter() - _trial_1_start:.2f} s")

In [None]:
# Show the fitted search object, then plot its results.
print(grid_results_t1)
graph_grid_search_decision_tree(grid_search=grid_results_t1)


In [None]:
# Start the grids at the smallest values scikit-learn accepts: max_depth >= 1
# and (integer) min_samples_split >= 2. Starting at 0 made every combination
# containing a 0 fail to fit.
max_depths = np.linspace(1, 20, 20, dtype=int)
min_samples_splits = np.linspace(2, 100, 20, dtype=int)

grid_results_t2 = grid_search_decision_tree(max_depths, min_samples_splits)

In [None]:
# Plot the second decision-tree search.
graph_grid_search_decision_tree(grid_search=grid_results_t2)

In [None]:
# Narrower search: depth 15-25, min_samples_split 30-60 (10 integer steps each).
max_depths = np.linspace(start=15, stop=25, num=10, dtype=int)
min_samples_splits = np.linspace(start=30, stop=60, num=10, dtype=int)

grid_results_t3 = grid_search_decision_tree(
    max_depths=max_depths,
    min_samples_splits=min_samples_splits,
)

In [None]:
# Plot the narrowed two-parameter search.
graph_grid_search_decision_tree(grid_search=grid_results_t3)

trial with 3 variables

In [None]:
# First three-parameter search: min_samples_leaf joins the grid (5 values each).
max_depths = np.linspace(start=10, stop=20, num=5, dtype=int)
min_samples_splits = np.linspace(start=50, stop=60, num=5, dtype=int)
min_samples_leafs = np.linspace(start=1, stop=10, num=5, dtype=int)

grid_results_t4 = grid_search_decision_tree(
    max_depths=max_depths,
    min_samples_splits=min_samples_splits,
    min_samples_leafs=min_samples_leafs,
)

trial 6

In [None]:
# Wider three-parameter search with 10 integer steps per parameter.
max_depths = np.linspace(start=10, stop=30, num=10, dtype=int)
min_samples_splits = np.linspace(start=40, stop=60, num=10, dtype=int)
min_samples_leafs = np.linspace(start=1, stop=20, num=10, dtype=int)

grid_results_t6 = grid_search_decision_tree(
    max_depths=max_depths,
    min_samples_splits=min_samples_splits,
    min_samples_leafs=min_samples_leafs,
)

plot all 3 vars

In [None]:
def plot_3d_heatmap(grid_results):
    """Scatter a three-parameter grid search in 3-D, colored by mean CV test score.

    Args:
        grid_results: Fitted search object exposing ``cv_results_``; every
            entry of ``cv_results_['params']`` must contain ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``.

    Raises:
        KeyError: If one of the three parameters was not part of the search.
    """
    cv_results = grid_results.cv_results_
    searched = pd.DataFrame(cv_results['params'])

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # Name the colormap explicitly. `plt.hot()` returns None, so the old
    # `cmap=plt.hot()` only worked because the call also switched the global
    # default colormap as a side effect.
    points = ax.scatter(
        searched['max_depth'],
        searched['min_samples_split'],
        searched['min_samples_leaf'],
        c=cv_results['mean_test_score'],
        cmap='hot',
    )

    # Label the axes so the three parameters can be told apart.
    ax.set_xlabel('max_depth')
    ax.set_ylabel('min_samples_split')
    ax.set_zlabel('min_samples_leaf')
    fig.colorbar(points, label='mean_test_score')
    plt.show()


# Visualize the trial-6 search over all three parameters.
plot_3d_heatmap(grid_results=grid_results_t6)


In [None]:
# Broad sweep of all three parameters over roughly 1-60.
max_depths = np.linspace(1, 60, 10, dtype=int)
# scikit-learn requires an integer min_samples_split to be at least 2, so this
# grid starts at 2; starting at 1 made every combination using it fail to fit.
min_samples_splits = np.linspace(2, 60, 10, dtype=int)
min_samples_leafs = np.linspace(1, 60, 10, dtype=int)

grid_results_t7 = grid_search_decision_tree(
    max_depths,
    min_samples_splits,
    min_samples_leafs
)

In [None]:
# Visualize the broad three-parameter sweep.
plot_3d_heatmap(grid_results=grid_results_t7)

t2

In [None]:
# Three-parameter search: depth 10-30, split 30-60, leaf 5-25 (10 steps each).
max_depths = np.linspace(start=10, stop=30, num=10, dtype=int)
min_samples_splits = np.linspace(start=30, stop=60, num=10, dtype=int)
min_samples_leafs = np.linspace(start=5, stop=25, num=10, dtype=int)

grid_results_t9 = grid_search_decision_tree(
    max_depths=max_depths,
    min_samples_splits=min_samples_splits,
    min_samples_leafs=min_samples_leafs,
)

In [None]:
# Visualize the trial-9 search.
plot_3d_heatmap(grid_results=grid_results_t9)

trial 3

In [None]:
# Tighter three-parameter search with 8 integer steps per parameter.
max_depths = np.linspace(start=10, stop=18, num=8, dtype=int)
min_samples_splits = np.linspace(start=32, stop=40, num=8, dtype=int)
min_samples_leafs = np.linspace(start=7, stop=15, num=8, dtype=int)

grid_results_t10 = grid_search_decision_tree(
    max_depths=max_depths,
    min_samples_splits=min_samples_splits,
    min_samples_leafs=min_samples_leafs,
)

In [None]:
# Visualize the trial-10 search.
plot_3d_heatmap(grid_results=grid_results_t10)

In [None]:
# Last refinement: depth 8-16, split 46-54, leaf 20-28 (8 steps each).
max_depths = np.linspace(start=8, stop=16, num=8, dtype=int)
min_samples_splits = np.linspace(start=46, stop=54, num=8, dtype=int)
min_samples_leafs = np.linspace(start=20, stop=28, num=8, dtype=int)

grid_results_11 = grid_search_decision_tree(
    max_depths=max_depths,
    min_samples_splits=min_samples_splits,
    min_samples_leafs=min_samples_leafs,
)

Optimal Decision Tree after training

In [None]:
# Final tree, fitted with the pruning parameters chosen from the searches above.
optimal_decision_tree = DecisionTreeRegressor(
    random_state=38,
    max_depth=13,
    min_samples_split=48,
    min_samples_leaf=11,
).fit(X_train, y_train)

for label, features, targets in (
    ("Train Score: ", X_train, y_train),
    ("Test Score: ", X_test, y_test),
):
    print(label, optimal_decision_tree.score(features, targets))
print('depth', optimal_decision_tree.get_depth())

## Feature Importance

### Lasso feature importance

In [None]:
# Coefficients of the fitted `optimal_lasso` model; kept in a variable because
# the feature-importance plot below reads `lasso_coefficients`.
lasso_coefficients = optimal_lasso.coef_
print(lasso_coefficients)

In [None]:
from sklearn.linear_model import LassoCV

# Let LassoCV choose alpha with 4-fold cross-validation, then report its scores.
lasso_cv = LassoCV(cv=4, random_state=38).fit(X_train, y_train)
print(lasso_cv.alpha_)
print("Train Score: ", lasso_cv.score(X_train, y_train))
print("Test Score: ", lasso_cv.score(X_test, y_test))

# Refit a plain Lasso with the selected alpha and report the same scores.
optimal_lasso_from_cv = Lasso(alpha=lasso_cv.alpha_).fit(X_train, y_train)
print("Train Score: ", optimal_lasso_from_cv.score(X_train, y_train))
print("Test Score: ", optimal_lasso_from_cv.score(X_test, y_test))


In [None]:
# fig, ax = plt.subplots()
# fig.set_figheight(4)
# fig.set_figwidth(4)
# plt.bar([x for x in range(len(lasso_coefficients))], lasso_coefficients)
# plt.xlabel('Feature')
# plt.ylabel('Importance')
# plt.show()

# Build the frame with one row per feature so each bar gets its own x-axis
# label. The earlier version called `.transpose()` without keeping the result,
# so the frame stayed a single row and every bar shared one tick.
lasso_importance_frame = pd.DataFrame(
    {'coefficient': lasso_coefficients},
    index=cal_housing.feature_names,
)
lasso_importance_frame.plot.bar(legend=False)

### Decision tree feature importance

In [None]:
decision_tree_importances = optimal_decision_tree.feature_importances_

# Build the frame with one row per feature so each bar gets its own x-axis
# label. The earlier version called `.transpose()` without keeping the result,
# so the frame stayed a single row and every bar shared one tick.
dt_importance_frame = pd.DataFrame(
    {'importance': decision_tree_importances},
    index=cal_housing.feature_names,
)
dt_importance_frame.plot.bar(legend=False)

In [None]:
from IPython.core.display_functions import display
from sklearn.tree import export_graphviz

# Write the fitted tree to "tree.dot" in Graphviz DOT format (nodes filled,
# impurity values omitted).
export_graphviz(
    decision_tree=optimal_decision_tree,
    out_file="tree.dot",
    feature_names=cal_housing.feature_names,
    filled=True,
    impurity=False,
)

import graphviz

# Read the DOT source back and render it inline in the notebook.
with open("tree.dot") as f:
    dot_graph = f.read()

display(graphviz.Source(dot_graph))