# Домашнее задание

In [65]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor

In [29]:
data = load_boston()

In [30]:
X = pd.DataFrame(data=data.data, columns=data.feature_names)
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [31]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB


In [32]:
y = pd.Series(data.target)
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
dtype: float64

In [33]:
X_scaled = StandardScaler().fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [34]:
X_scaled.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## DecisionTreeRegressor

In [42]:
tree_params = {
    'max_depth': [5, 7, 9, 11, 13, 15],
    'min_samples_leaf': [10, 13, 15, 18, 20, 25],
    'max_features': [5, 6, 7, 8, 9, 10, 11, 12, 13]
}

In [50]:
tree = DecisionTreeRegressor(random_state=17)
grid_tree = GridSearchCV(tree, tree_params, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_tree.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=17, splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [5, 7, 9, 11, 13, 15], 'min_samples_leaf': [10, 13, 15, 18, 20, 25], 'max_features': [5, 6, 7, 8, 9, 10, 11, 12, 13]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [51]:
print(grid_tree.best_params_)
print(grid_tree.best_score_)
print(grid_tree.best_estimator_)

{'max_depth': 9, 'max_features': 12, 'min_samples_leaf': 10}
-20.09586543496935
DecisionTreeRegressor(criterion='mse', max_depth=9, max_features=12,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=17, splitter='best')


## SVR

In [54]:
svr_params = {
    'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    'epsilon': [0.1, 1.0, 5.0, 10.0, 20.0, 50.0]
}

In [55]:
svr = SVR()
grid_svr = GridSearchCV(svr, svr_params, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_svr.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'epsilon': [0.1, 1.0, 5.0, 10.0, 20.0, 50.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [56]:
print(grid_svr.best_params_)
print(grid_svr.best_score_)
print(grid_svr.best_estimator_)

{'C': 100.0, 'epsilon': 0.1}
-67.46926262749355
SVR(C=100.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)


## RandomForestRegressor

In [58]:
forest_params = {
    'n_estimators': [10, 20, 30, 40, 50, 60],
    'max_depth': [5, 7, 9, 11, 13, 15],
    'min_samples_leaf': [10, 13, 15, 18, 20, 25],
    'max_features': [5, 6, 7, 8, 9, 10, 11, 12, 13]
}

In [59]:
forest = RandomForestRegressor(random_state=17)
grid_forest = GridSearchCV(forest, forest_params, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_forest.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=17, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [10, 20, 30, 40, 50, 60], 'max_depth': [5, 7, 9, 11, 13, 15], 'min_samples_leaf': [10, 13, 15, 18, 20, 25], 'max_features': [5, 6, 7, 8, 9, 10, 11, 12, 13]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [60]:
print(grid_forest.best_params_)
print(grid_forest.best_score_)
print(grid_forest.best_estimator_)

{'max_depth': 7, 'max_features': 9, 'min_samples_leaf': 10, 'n_estimators': 10}
-16.14493918078844
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features=9, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=None, oob_score=False, random_state=17,
           verbose=0, warm_start=False)


## Ridge

In [62]:
ridge_params = {
    'alpha': [0.01, 0.1, 1.0, 5.0, 10.0, 20.0, 50.0]
}

In [63]:
ridge = Ridge(random_state=17)
grid_ridge = GridSearchCV(ridge, ridge_params, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_ridge.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=17, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'alpha': [0.01, 0.1, 1.0, 5.0, 10.0, 20.0, 50.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [64]:
print(grid_ridge.best_params_)
print(grid_ridge.best_score_)
print(grid_ridge.best_estimator_)

{'alpha': 0.01}
-23.177060834396126
Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=17, solver='auto', tol=0.001)


## KNeighborsRegressor

In [66]:
knn_params = {
    'n_neighbors': [3, 5, 7, 10, 12, 15, 20]
}

In [67]:
knn = KNeighborsRegressor()
grid_knn = GridSearchCV(knn, knn_params, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
grid_knn.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_neighbors': [3, 5, 7, 10, 12, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [68]:
print(grid_knn.best_params_)
print(grid_knn.best_score_)
print(grid_knn.best_estimator_)

{'n_neighbors': 3}
-39.212073707370735
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=3, p=2,
          weights='uniform')


## Сравнение результатов

In [72]:
estimators = {
    'tree': grid_tree,
    'svr': grid_svr,
    'forest': grid_forest,
    'ridge': grid_ridge,
    'knn': grid_knn
}

In [73]:
for i in estimators:
    e = estimators[i]
    print(i, 'CV MSE:', e.best_score_, 'Validation MSE:', e.best_estimator_.score(X_test, y_test))

tree CV MSE: -20.09586543496935 Validation MSE: 0.6609862481749684
svr CV MSE: -67.46926262749355 Validation MSE: 0.15030498425165917
forest CV MSE: -16.14493918078844 Validation MSE: 0.8435986146769617
ridge CV MSE: -23.177060834396126 Validation MSE: 0.7112895552205787
knn CV MSE: -39.212073707370735 Validation MSE: 0.5702876710426381
