# Домашняя работа. Кросс-валидация

Задание:

Взять boston house-prices dataset (sklearn.datasets.load_boston) и сделать то же самое для задачи регрессии (попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество)

Возьмите 5 любых классификаторов, подберите для них параметры и сравните итоговое качество на отложенной выборке

## 0. Библиотеки

In [312]:
# Работа с данными
import pandas as pd
import numpy as np

# Датасет
from sklearn import datasets

# Дополнительные фичи
from sklearn.model_selection import train_test_split

# Модели 
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# Работа с метаданными
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


#Визуализации
import seaborn as sns
%matplotlib inline
from jupyterthemes import jtplot
jtplot.style()
import matplotlib.pyplot as plt

## 1. Работа с данными

In [145]:
# Load the Boston house-prices dataset that ships with scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the installed version before re-running this cell.
dataset = datasets.load_boston()

In [146]:
# Wrap the raw arrays in pandas objects: the features keep their column
# names, the target becomes a Series.
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.Series(dataset.target)

In [147]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [148]:
# Preview the first rows of the target.
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
dtype: float64

## 2. Обучение моделей регрессии

Для простых моделей будем использовать *Grid Search*, поскольку он перебирает все комбинации параметров, а для более сложных — *Randomized Search*.
*Randomized Search* выбирает случайные значения из заданных параметров и пробует запускать обучение с ними.
Для сложных моделей сначала запустим *Randomized Search*, а потом на ограниченном количестве моделей — *Grid Search*.

### 2.0 Линейная регрессия (Обычная)

In [156]:
# Baseline: see what score plain linear regression gives.
model_reg = LinearRegression()

# The only setting searched is whether an intercept term is fitted.
fit_intercepts = [False, True]
param_regr = {"fit_intercept": fit_intercepts}

In [157]:
# Exhaustive search over `param_regr` with 5-fold cross-validation,
# fitted on the training split only.
grid_reg = GridSearchCV(model_reg, param_regr, cv=5)
grid_reg.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'fit_intercept': [False, True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [159]:
# Report the best cross-validation score and the estimator that achieved it.
print(grid_reg.best_score_)
print(grid_reg.best_estimator_)

0.7485360676638366
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)


### 2.1 Линейная регрессия (Huber Loss)

In [174]:
model_huberloss = HuberRegressor()

# Candidate values: 100 evenly spaced alphas in [0.0001, 0.01] and
# 11 evenly spaced epsilons in [1, 4].
alphas = np.linspace(0.0001, 0.01, 100)
epsilons = np.linspace(1, 4, 11)

# Both ranges collected into a single search grid.
param_huberloss = {"alpha": alphas, "epsilon": epsilons}

In [175]:
# Exhaustive search over `param_huberloss` with 5-fold cross-validation,
# fitted on the training split only.
grid_huberloss = GridSearchCV(model_huberloss, param_huberloss, cv=5)
grid_huberloss.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100,
        tol=1e-05, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([0.0001, 0.0002, ..., 0.0099, 0.01  ]), 'epsilon': array([1. , 1.3, 1.6, 1.9, 2.2, 2.5, 2.8, 3.1, 3.4, 3.7, 4. ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [176]:
# Report the best cross-validation score and the estimator that achieved it.
print(grid_huberloss.best_score_)
print(grid_huberloss.best_estimator_)

0.7160406917567481
HuberRegressor(alpha=0.0037, epsilon=1.6, fit_intercept=True, max_iter=100,
        tol=1e-05, warm_start=False)


### 2.2 "К" ближайших соседей (для задачи регрессии)

In [177]:
model_knn = KNeighborsRegressor()

# Candidate values for each tuned hyper-parameter.
# NOTE(review): scikit-learn documents `leaf_size` as a speed/memory setting
# of the neighbour index; it presumably does not change predictions, so its
# 25 values mostly multiply the search cost — confirm and consider dropping it.
weight = ['uniform', 'distance']
n_neighbor = range(1, 30)
leaf_size = range(5, 30)

# Search grid passed to the cross-validated search.
param_knn = {
    "n_neighbors": n_neighbor,
    "weights": weight,
    "leaf_size": leaf_size,
}

In [178]:
# Exhaustive search over `param_knn` with 5-fold cross-validation,
# fitted on the training split only.
grid_knn = GridSearchCV(model_knn, param_knn, cv=5)
grid_knn.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': range(1, 30), 'weights': ['uniform', 'distance'], 'leaf_size': range(5, 30)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [179]:
# Report the best cross-validation score and the estimator that achieved it.
print(grid_knn.best_score_)
print(grid_knn.best_estimator_)

0.5826880125028423
KNeighborsRegressor(algorithm='auto', leaf_size=5, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=7, p=2,
          weights='distance')


### 2.3 DecisionTreeRegressor

In [293]:
# List the hyper-parameter names a decision-tree regressor accepts.
# A fresh default estimator is queried instead of `grid_tree`, because
# `grid_tree` is only created further down; reading it here raises NameError
# when the notebook is run top to bottom.
DecisionTreeRegressor().get_params().keys()

dict_keys(['criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])

In [309]:
# Decision-tree regressor with default settings.
# NOTE(review): no random_state is passed, so scores may vary between runs.
model_tree = DecisionTreeRegressor()

# Candidate values for each tuned hyper-parameter.
criterions = ['mse', 'friedman_mse', 'mae']
min_samples_leafs = range(1, 10)
min_weight_fraction_leafs = [0, 0.01, 0.1, 0.15, 0.25, 0.5]

# Search grid passed to the cross-validated search.
param_tree = {
    "criterion": criterions,
    "min_samples_leaf": min_samples_leafs,
    "min_weight_fraction_leaf": min_weight_fraction_leafs,
}

In [310]:
# Exhaustive search over `param_tree` with 5-fold cross-validation,
# fitted on the training split only.
grid_tree = GridSearchCV(model_tree, param_tree, cv=5)
grid_tree.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'criterion': ['mse', 'friedman_mse', 'mae'], 'min_samples_leaf': range(1, 10), 'min_weight_fraction_leaf': [0, 0.01, 0.1, 0.15, 0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [311]:
# Report the best cross-validation score and the estimator that achieved it.
print(grid_tree.best_score_)
print(grid_tree.best_estimator_)

0.7929177516585688
DecisionTreeRegressor(criterion='mae', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.01,
           presort=False, random_state=None, splitter='best')


### 2.4 RandomForest

In [342]:
# Random-forest regressor. The seed is fixed (same value as the train/test
# split) so cross-validation scores are reproducible between runs; with an
# unseeded forest, differences between grid points would be partly noise.
model_forest = RandomForestRegressor(random_state=0)

# Candidate values for each tuned hyper-parameter.
n_estimators_ = range(1, 15)
criterions = ['mse', 'friedman_mse', 'mae']
min_samples_leafs = range(1, 10)

# Search grid passed to the cross-validated search.
param_forest = {
    "n_estimators": n_estimators_,
    "criterion": criterions,
    "min_samples_leaf": min_samples_leafs,
}

In [None]:
# Exhaustive search over `param_forest` with 5-fold cross-validation,
# fitted on the training split only.
grid_forest = GridSearchCV(model_forest, param_forest, cv=5)
grid_forest.fit(X_train, y_train)

In [None]:
# Report the best cross-validation score and the estimator that achieved it.
print(grid_forest.best_score_)
print(grid_forest.best_estimator_)