# Linear Regression

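- fit_intercept
    > Whether to calculate the intercept for this model. If set to False, no intercept is used in the calculations (i.e. the data is expected to be centered)

- normalize (deprecated in scikit-learn 1.0, removed in 1.2)
    > If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use StandardScaler before calling fit on an estimator with normalize=False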

- copy_X
> If True, X will be copied; else, it may be overwritten

- n_jobs
> The number of jobs to use for the computation. -1 means using all processors

- positive
> When set to True, forces the coefficients to be positive

In [None]:
# Linear Regression
# Explicit (default) hyper-parameters. The legacy `normalize` option is
# intentionally omitted: it was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it to set_params() raises "Invalid parameter".
params = {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}
regressor = LinearRegression()
regressor.set_params(**params)
regressor.fit(X_train, Y_train)

# Predictions on the held-out split; reused by the scoring cells.
Y_pred = regressor.predict(X_test)

## Lasso Regression [L1]

- alpha
    > alpha = 0 is equivalent to ordinary least squares. When alpha is very large, all coefficients are driven to zero. The L1 penalty minimizes the sum of the absolute values of the coefficients, which can shrink some of them to exactly zero
    
- selection
> {'cyclic', 'random'}, default='cyclic'
If set to ‘random’, a random coefficient is updated every iteration. Often leads to significantly faster convergence


In [None]:
# Lasso
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2. On newer versions drop it and scale X beforehand (e.g. StandardScaler)
# -- confirm the target scikit-learn version.
regressor = Lasso(alpha=0.3, normalize=True)
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)

# Lasso with CV
# The cross-validated estimator must be bound to `regressor`; previously it was
# stored in `reg`, so the plain Lasso above was silently re-fitted instead.
regressor = LassoCV(cv=5, random_state=0)
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)

## Ridge Regression [L2]

Ridge regression addresses some of the problems of Ordinary Least Squares by imposing a penalty on the size of the coefficients with l2 regularization

- solver
> {‘auto’, ‘svd’, ‘cholesky’, ‘lsqr’, ‘sparse_cg’, ‘sag’, ‘saga’, ‘lbfgs’}, default=’auto’.
The ‘lbfgs’ solver can be used only when positive is True

In [None]:
# Ridge -- option 1: small penalty with the legacy `normalize` flag
# NOTE(review): `normalize` was removed in scikit-learn 1.2 -- confirm version.
regressor = Ridge(alpha=0.01, normalize=True).fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)

# OR -- option 2: larger penalty solved with the closed-form Cholesky solver
regressor = Ridge(alpha=0.5, solver="cholesky").fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)


# Ridge with CV: the best alpha is picked from the candidates by 5-fold CV
clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=5)
clf.fit(X, y)

## ElasticNet Regression [L1 + L2]

- selection
> {'cyclic', 'random'}, default='cyclic'
If set to ‘random’, a random coefficient is updated every iteration. Often leads to significantly faster convergence

In [None]:
# ElasticNet
# `normalize=False` was the default and the parameter was removed in
# scikit-learn 1.2, so it is simply omitted.
regressor = ElasticNet(alpha=1, l1_ratio=0.5)
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)

# ElasticNetCV
# ElasticNetCV searches over a path of alphas itself and has no `alpha`
# argument (passing one raises TypeError); use `alphas=[...]` to pin candidates.
regr = ElasticNetCV(l1_ratio=0.5, cv=5, random_state=0)
regr.fit(X, y)

# SGD Regressor

In [None]:
# Each variant below re-binds `reg`; fit() returns the fitted estimator itself.

# SGDRegressor with Lasso (L1 penalty)
reg = SGDRegressor(penalty="l1").fit(X, y.ravel())

# SGDRegressor with Ridge (L2 penalty)
reg = SGDRegressor(penalty="l2").fit(X, y.ravel())

# SGDRegressor with ElasticNet (equal mix of L1 and L2)
reg = SGDRegressor(penalty="elasticnet", l1_ratio=0.5).fit(X, y.ravel())

# Polynomial Regression

- degree
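> If a single int is given, it specifies the maximal degree of the polynomial features. If a tuple (min_degree, max_degree) is passed, then min_degree is the minimum and max_degree is the maximum polynomial degree of the generated features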

In [None]:
# Find the best polynomial degree.
# NOTE: the degree is selected on the test split, so the reported RMSE is
# optimistic; prefer a validation split or cross-validation for model selection.
rmses = []
degrees = np.arange(1, 10)
min_rmse, min_deg = float("inf"), 0
for deg in degrees:
    poly_features = PolynomialFeatures(degree=deg, include_bias=False)
    x_poly_train = poly_features.fit_transform(X_train)
    poly_reg = LinearRegression()
    poly_reg.fit(x_poly_train, Y_train)
    # Only transform the test data with the transformer fitted on train.
    x_poly_test = poly_features.transform(X_test)
    poly_predict = poly_reg.predict(x_poly_test)
    # np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed
    # (for multi-output targets this is the root of the averaged MSE).
    poly_rmse = np.sqrt(mean_squared_error(Y_test, poly_predict))
    rmses.append(poly_rmse)
    if poly_rmse < min_rmse:
        min_rmse = poly_rmse
        min_deg = int(deg)

print('Best degree {} with RMSE {}'.format(min_deg, min_rmse))

# Plot RMSE against degree. `rmses` stays in degree order so that
# rmses[i] keeps corresponding to degrees[i].
plt.plot(degrees, rmses)
plt.xlabel('Degree')
plt.ylabel('RMSE')

# Training with the selected degree.
# NOTE: X_train / X_test are overwritten with their polynomial expansion so the
# later scoring cells work on `regressor`; do not re-run this cell without
# reloading the original splits.
poly_features = PolynomialFeatures(degree=min_deg, include_bias=False)
X_train = poly_features.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

X_test = poly_features.transform(X_test)
Y_pred = regressor.predict(X_test)

# Support Vector Regression

- kernel
> kernel: {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable, default=’rbf’

- degree
> Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels

- C
> Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty

- epsilon
> The larger ϵ is, the larger errors you admit in your solution. By contrast, if ϵ→0+, every error is penalized

- gamma
> {‘scale’, ‘auto’} or float, default=’scale’
1. if gamma='scale' (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma,
2. if ‘auto’, uses 1 / n_features.

- max_iter
> Hard limit on iterations within solver, or -1 for no limit

- verbose
> Controls the verbosity when fitting and predicting. Note that, if enabled, it may not work properly in a multithreaded context

SVR gives us the flexibility to define how much error is acceptable in our model and will find an appropriate line (or hyperplane in higher dimensions) to fit the data

In [None]:
# kernel = rbf (library defaults spelled out explicitly)
# `degree` is ignored by the rbf kernel; it is set to the library default (3)
# so the dict matches the default params listed in the GridSearchCV section.
regressor = SVR()
params = {'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale',
          'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}
regressor.set_params(**params)

# kernel = poly
# NOTE: this re-binds `regressor`, so the polynomial model is the one that is
# actually fitted below; comment this line out to train the rbf model instead.
regressor = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)

regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)

# Decision Tree Regressor

- splitter: The strategy used to choose the split at each node. {“best”, “random”}

- min_weight_fraction_leaf: The minimum weight fraction of the sum total of weights required to be at a leaf node

- ccp_alpha: Greater values of ccp_alpha increase the number of nodes pruned

In [None]:
from sklearn import tree

# Hyper-parameters for the tree, handed straight to the constructor.
params = {
    'ccp_alpha': 0.0,
    'criterion': 'squared_error',
    'max_depth': None,
    'max_features': None,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.0,
    'random_state': 0,
    'splitter': 'best',
}
regressor = DecisionTreeRegressor(**params)
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)


# view tree (draws the fitted tree with matplotlib)
tree.plot_tree(regressor)

# Random Forest Regressor

- n_estimators
> The number of trees in the forest

- criterion
> The function to measure the quality of a split. "squared_error", "absolute_error", "poisson". default="squared_error"

- max_depth
> The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples

- min_samples_split
> The minimum number of samples required to split an internal node

- min_samples_leaf
> The minimum number of samples required to be at a leaf node. Increasing this number smooths the model and reduces the risk of overfitting; very small values (such as 1) make overfitting more likely

- max_features
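> The number of features to consider when looking for the best split.
{“sqrt”, “log2”, None}, int or float. (“auto” was deprecated in scikit-learn 1.1 and removed in 1.3; for regressors it meant “use all features”, i.e. 1.0)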

- bootstrap
> Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree

- oob_score
> Whether to use out-of-bag samples to estimate the generalization score. Only available if bootstrap=True

- max_samples
> If bootstrap is True, the number of samples to draw from X to train each base estimator

In [None]:
# Random forest with explicit hyper-parameters.
# `max_features` is 1.0 (use every feature), which is what the legacy value
# 'auto' meant for regressors; 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, while the float form is accepted by all versions.
regressor = RandomForestRegressor()
params = {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None,
          'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0,
          'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10,
          'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}
regressor.set_params(**params)
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)

# Bagging Regressor

In [None]:
# Bagging ensemble of 10 SVR models.
# The base model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4), and the positional form works on both old and new versions.
reg = BaggingRegressor(SVR(), n_estimators=10, random_state=0)

reg.fit(X, y)
reg.predict(X)

# Voting Regressor

In [None]:
# Base models whose predictions are averaged by the voting ensemble.
linear_member = ('r1', LinearRegression())
forest_member = ('r2', RandomForestRegressor(n_estimators=10, random_state=1))
estimators = [linear_member, forest_member]

reg = VotingRegressor(estimators=estimators).fit(X, y)
reg.predict(X)

# Stacking Regressor

In [None]:
# Level-0 learners; their predictions become the inputs of the final estimator.
estimators = [('lr', RidgeCV()), ('svr', LinearSVR(random_state=42))]

# Level-1 (meta) learner that combines the level-0 predictions.
meta_learner = RandomForestRegressor(n_estimators=10, random_state=42)
reg = StackingRegressor(estimators=estimators, final_estimator=meta_learner)

reg.fit(X_train, y_train)

# Gradient Boosting Regressor

In [None]:
# Gradient boosting by hand: every new tree is trained on the residual errors
# left by the trees before it.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

# Residuals of the first tree become the targets of the second one.
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

# Residuals of the second tree become the targets of the third one.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

# The ensemble prediction is the sum of the individual tree predictions.
stages = (tree_reg1, tree_reg2, tree_reg3)
y_pred = sum(stage.predict(X_new) for stage in stages)

# OR: the same three-stage ensemble built by the library

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)


# Find the optimal number of trees: train a large ensemble once, then measure
# the validation error after every boosting stage.
X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
# staged_predict yields the prediction after 1, 2, ... trees, so list index i
# corresponds to i + 1 estimators. Without the +1 the refit uses one tree too
# few (and fails with n_estimators=0 when the first stage is the best).
bst_n_estimators = int(np.argmin(errors)) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

# Early stopping: grow the ensemble one tree at a time (warm_start keeps the
# already fitted trees) and stop once the validation error has failed to
# improve five times in a row.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.set_params(n_estimators=n_estimators)
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    improved = val_error < min_val_error
    if improved:
        # New best score: remember it and reset the patience counter.
        min_val_error = val_error
        error_going_up = 0
        continue
    error_going_up += 1
    if error_going_up == 5:
        break

# XGB Regressor [Extreme Gradient Boosting]

> XGBoost is an implementation of Gradient Boosted decision trees

- booster
> gbtree, gblinear or dart

- objective
> binary:logistic (classification), reg:squarederror (regression; formerly named reg:linear, which is deprecated)

In [None]:
# scikit-learn style API.
# 'reg:squarederror' is the current name of the squared-error objective; the
# old alias 'reg:linear' is deprecated.
xgb_reg = XGBRegressor(objective='reg:squarederror', n_estimators=10, seed=123)
xgb_reg.fit(train_X, train_y)

In [None]:
# Train and test set are converted to DMatrix objects,
# as it is required by the learning API.
train_dmatrix = xg.DMatrix(data=train_X, label=train_y)
test_dmatrix = xg.DMatrix(data=test_X, label=test_y)

# Parameter dictionary specifying the base learner.
# 'reg:squarederror' replaces the deprecated alias 'reg:linear'.
param = {"booster": "gblinear", "objective": "reg:squarederror"}

xgb_r = xg.train(params=param, dtrain=train_dmatrix, num_boost_round=10)
pred = xgb_r.predict(test_dmatrix)

In [None]:
# Load the demo train / test sets straight from their text files
dtrain = xg.DMatrix('demo/data/agaricus.txt.train')
dtest = xg.DMatrix('demo/data/agaricus.txt.test')

# Booster settings are given as a plain dict; two boosting rounds are run
num_round = 2
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
bst = xg.train(params=param, dtrain=dtrain, num_boost_round=num_round)

# Predictions for the test set
preds = bst.predict(dtest)

# Adaptive Boosting

#### Params

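- base_estimator (renamed to `estimator` in scikit-learn 1.2): If None, then the base estimator is DecisionTreeRegressor initialized with max_depth=3 (for AdaBoostClassifier it is DecisionTreeClassifier with max_depth=1)
- loss: {‘linear’, ‘square’, ‘exponential’}. The loss function used when updating the weights after each boosting iteration (AdaBoostRegressor only)
- algorithm: {‘SAMME’, ‘SAMME.R’} (AdaBoostClassifier only; AdaBoostRegressor has no such parameter)
If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations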

In [None]:
# AdaBoost with 100 boosting rounds; fit() returns the fitted estimator.
regr = AdaBoostRegressor(n_estimators=100, random_state=0).fit(X, y)

# LightGBM

In [None]:
from lightgbm import LGBMRegressor

# Gradient-boosted trees with the library's default hyper-parameters.
model = LGBMRegressor().fit(X, y)

# Cat Boost


In [None]:
# pip install catboost
from catboost import CatBoostRegressor

# Every column whose dtype is not float is treated as categorical.
# The builtin `float` is used because the alias `np.float` was deprecated in
# NumPy 1.20 and removed in 1.24 (it was the very same object).
# assumes X is a pandas DataFrame (it exposes `.dtypes`) -- TODO confirm
categorical_features_indices = np.where(X.dtypes != float)[0]

model = CatBoostRegressor(iterations=50, depth=3, learning_rate=0.1, loss_function='RMSE')

# The eval_set is only monitored during training; plot=True draws the
# learning curves inside the notebook.
model.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_test, y_test), plot=True)

model.predict(X_test)

In [None]:
import numpy as np
from catboost import Pool, CatBoostRegressor

# Synthetic integer data: 100 training rows and 50 test rows of 10 features
train_data = np.random.randint(0, 100, size=(100, 10))
train_label = np.random.randint(0, 1000, size=100)
test_data = np.random.randint(0, 100, size=(50, 10))

# Columns 0, 2 and 5 are declared categorical in both pools
categorical_columns = [0, 2, 5]
train_pool = Pool(train_data, train_label, cat_features=categorical_columns)
test_pool = Pool(test_data, cat_features=categorical_columns)

# A deliberately tiny model: two iterations of depth-2 trees
model = CatBoostRegressor(iterations=2, depth=2, learning_rate=1, loss_function='RMSE')

# Fit on the training pool, then predict the test pool
model.fit(train_pool)
preds = model.predict(test_pool)
print(preds)

# ExtraTreesRegressor

> The Extra Trees algorithm works by creating a large number of unpruned decision trees from the training dataset

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Ensemble of 100 extremely randomized trees with a fixed seed.
regressor = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

# Multivariate Adaptive Regression Splines in Python [MARS]

**Notes:**
1. MARS belongs to the group of regression algorithms used to predict continuous (numerical) target variables
2. The algorithm has two stages: the forward stage and the backward stage

In [None]:
# Install py-earth (the `pyearth` package providing the Earth/MARS estimator)
# from the scikit-learn-contrib GitHub repository, pinned to the v0.2dev branch.
# The leading `!` runs the command in a shell from the notebook.
!pip install git+https://github.com/scikit-learn-contrib/py-earth@v0.2dev

Collecting git+https://github.com/scikit-learn-contrib/py-earth@v0.2dev
  Cloning https://github.com/scikit-learn-contrib/py-earth (to revision v0.2dev) to /tmp/pip-req-build-y9i0ddr8
  Running command git clone -q https://github.com/scikit-learn-contrib/py-earth /tmp/pip-req-build-y9i0ddr8
  Running command git checkout -b v0.2dev --track origin/v0.2dev
  Switched to a new branch 'v0.2dev'
  Branch 'v0.2dev' set up to track remote branch 'v0.2dev' from 'origin'.


In [None]:
from pyearth import Earth

# MARS model with default settings; fit on train, predict the test split.
regressor = Earth()
regressor.fit(X_train, Y_train)

Y_pred = regressor.predict(X_test)

# Principal Components Regression (PCR)

PCR is a regression technique that is widely used when you have many independent variables or when multicollinearity exists in your data. It is divided into 2 steps:
1. Getting the Principal components
2. Run regression analysis on principal components

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Step 1: learn the principal components on the training data only, then
# project BOTH splits onto those same components. Re-fitting PCA on the test
# set would yield a different basis, so the regression coefficients would no
# longer match the test features (and test information would leak in).
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Step 2: train the PCR model (linear regression on the components)
regr = LinearRegression()
regr.fit(X_train, y_train)

# calculate RMSE on the held-out split
pred = regr.predict(X_test)
np.sqrt(mean_squared_error(y_test, pred))

#  Partial Least Squares (PLS) Regression

It is an alternative technique of principal component regression when you have independent variables highly correlated. It is also useful when there are a large number of independent variables

In [None]:
from sklearn.cross_decomposition import PLSRegression

# Toy data set: four samples, three features, two targets per sample
X = [
    [0., 0., 1.],
    [1., 0., 0.],
    [2., 2., 2.],
    [2., 5., 4.],
]
Y = [
    [0.1, -0.2],
    [0.9, 1.1],
    [6.2, 5.9],
    [11.9, 12.3],
]

# Two latent components, fitted and evaluated on the same toy data
pls2 = PLSRegression(n_components=2).fit(X, Y)

Y_pred = pls2.predict(X)

# Poisson Regression

Poisson regression is used to predict a dependent variable that consists of "count data" given one or more independent variables

Application of Poisson Regression -

1. Predicting the number of calls in customer care related to a particular product
2. Estimating the number of emergency service calls during an event

> poisson regression assumes the variance equal to its mean


In [None]:
from sklearn.linear_model import PoissonRegressor

# Toy count data: four samples with two features each
X = [[1, 2], [2, 3], [3, 4], [4, 3]]
y = [12, 17, 22, 21]

clf = PoissonRegressor().fit(X, y)

# Score of the fitted model on its own training data
clf.score(X, y)

# Cox Regression


# Tobit Regression

# Negative Binomial Regression

Like Poisson regression, it also deals with count data. The question arises: "How is it different from Poisson regression?" The answer is that negative binomial regression does not assume that the variance of the counts equals their mean

# Quasi Poisson Regression

It is an alternative to negative binomial regression. It can also be used for overdispersed count data. Although both algorithms give similar results, they differ in how they estimate the effects of covariates. The variance of a quasi-Poisson model is a linear function of the mean, while the variance of a negative binomial model is a quadratic function of the mean.

# Ordinal Regression

Ordinal Regression is used to predict ranked values. In simple words, this type of regression is suitable when dependent variable is ordinal in nature. 

### Example of ordinal variables
1. Survey responses (1 to 6 scale), patient reaction to drug dose (none, mild, severe)
2. Predicting a movie rating on a scale of 1 to 5 stars can be considered an ordinal regression task

# Score Analysis

In [None]:
# Score (for regressors, .score() returns the coefficient of determination R^2)
training_score = regressor.score(X_train, Y_train)
test_score = regressor.score(X_test, Y_test)
print("training_score = ", training_score)
print("test_score = ", test_score)

# R2 Score
r_2_score = r2_score(Y_test, Y_pred)
print("r_2_score = ", r_2_score)

# Mean Squared Error and its root.
# The previous version printed the *root* MSE under the label "mse" (via
# `squared=False`, which newer scikit-learn removed); both metrics are now
# computed explicitly and labelled correctly.
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
print("mse = ", mse)
print("rmse = ", rmse)

training_score =  0.9226333502184944
test_score =  0.9301891740390946
r_2_score =  0.9301891740390946
mse =  0.221737051360728


# Compare between Actual and Predicted

In [None]:
# Show predictions next to the true targets for the first five test samples.
predicted_column = Y_pred.reshape(len(Y_pred), 1)
actual_column = Y_test.reshape(len(Y_test), 1)
compare_predict_data = np.hstack((predicted_column, actual_column))[:5]
print(compare_predict_data)

[[1.31 1.  ]
 [0.11 0.  ]
 [2.24 2.  ]
 [1.41 1.  ]
 [1.36 1.  ]]


# K-fold Cross-Validation

- What is Cross-Validation?
>Cross-validation is essentially a technique used to assess how well a model performs on a new, independent dataset.
The simplest example of cross-validation is when you split your data into three groups: training data, validation data, and testing data, where you use the training data to build the model, the validation data to tune the hyperparameters, and the testing data to evaluate your final model

In [None]:
# Way 1: plain 5-fold cross-validation on the training split
# (each fold is scored with the estimator's default scorer)
accuracies = cross_val_score(estimator=regressor, X=X_train, y=Y_train, cv=5)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print(f"Accuracy: {mean_pct:.2f} %")
print(f"Standard Deviation: {std_pct:.2f} %")

Accuracy: 91.16 %
Standard Deviation: 2.15 %


In [None]:
# Way 2: repeated k-fold (10 splits x 3 repeats) on the full data set,
# evaluated in parallel on all available processors
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
accuracies = cross_val_score(regressor, X, y, cv=cv, n_jobs=-1)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print(f"Accuracy: {mean_pct:.2f} %")
print(f"Standard Deviation: {std_pct:.2f} %")

Accuracy: 91.71 %
Standard Deviation: 4.28 %


## Predict unknown

In [None]:
# A single hand-made sample with five feature values, wrapped in a 2-D list
# because predict() expects one row per sample.
unknown_sample = [0, 0, 0, 0, 7777777]
predict_random_data = regressor.predict([unknown_sample])
print("predict is = ", predict_random_data)

# **Model Improvement**  
### Using GridSearchCV and RandomizedSearchCV

1. https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
# GridSearchCV
# Linear Regression
# presumably `regressor` is a LinearRegression instance at this point -- the
# grid below only contains LinearRegression parameters; verify before running.
#
# Default params:
#   {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}
#
# All scoring values for regression:
#   explained_variance, max_error, neg_mean_absolute_error, neg_mean_squared_error,
#   neg_root_mean_squared_error, neg_mean_squared_log_error, neg_median_absolute_error, r2,
#   neg_mean_poisson_deviance, neg_mean_gamma_deviance, neg_mean_absolute_percentage_error

# `normalize` is not part of the grid: the parameter was removed in
# scikit-learn 1.2 and would make every candidate fail there.
parameters = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'positive': [True, False],
}
grid_search = GridSearchCV(estimator=regressor, param_grid=parameters, scoring='explained_variance', cv=5)
grid_search.fit(X_train, Y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
best_estimator = grid_search.best_estimator_
cvres = grid_search.cv_results_

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)
print("Best Estimator:", best_estimator)
print("#"*20)
# The scores are explained-variance values, not squared errors, so they are
# printed as-is; taking np.sqrt of them is meaningless (and NaN when negative).
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)
print("#"*20)

In [None]:
# GridSearchCV
# Support Vector Regression
# presumably `regressor` is an SVR instance at this point -- verify.
#
# Default params:
#   {'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale',
#    'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}
#
# All scoring values for regression:
#   explained_variance, max_error, neg_mean_absolute_error, neg_mean_squared_error,
#   neg_root_mean_squared_error, neg_mean_squared_log_error, neg_median_absolute_error, r2,
#   neg_mean_poisson_deviance, neg_mean_gamma_deviance, neg_mean_absolute_percentage_error

# A wider (much slower) grid, kept for reference:
# parameters = {
#     'C':[1.0, 2.0, 3.0, 4.0, 5.0],
#     'cache_size':[i for i in range(100, 1000, 100)],
#     'coef0' : [0.01,10,0.5],
#     'degree': [i for i in range(1, 10)],
#     'gamma' : ('auto','scale'),
#     'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
#     'max_iter': [-1],
#     'shrinking': [True, False],
#     'verbose': [True, False]
#     }
parameters = [{'C': [0.25, 0.5, 0.75, 1], 'kernel': ['linear']},
              {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
# scoring=None falls back to the estimator's own .score(), i.e. R^2 for SVR.
grid_search = GridSearchCV(estimator=regressor, param_grid=parameters, scoring=None, cv=5)
grid_search.fit(X_train, Y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
best_estimator = grid_search.best_estimator_
cvres = grid_search.cv_results_

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)
print("Best Estimator:", best_estimator)
print("#"*20)
# The scores are R^2 values, not squared errors, so they are printed as-is;
# np.sqrt would distort them and produce NaN for negative scores.
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)
print("#"*20)

Best Accuracy: 94.31 %
Best Parameters: {'C': 0.75, 'gamma': 0.3, 'kernel': 'rbf'}


In [None]:
# RandomizedSearchCV
# my model
# presumably `regressor` is a tree-based regressor (decision tree / random
# forest) -- the distributions below only contain tree parameters; verify.
from scipy.stats import randint

# "gini" / "entropy" are classification criteria and make every candidate fail
# on a regressor; the regression criteria are used instead.
# assumes the data has at least 8 features (max_features is drawn from 1..8)
# -- TODO confirm
parameters = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["squared_error", "absolute_error"]}

random_search = RandomizedSearchCV(estimator=regressor, param_distributions=parameters, cv=5)
random_search.fit(X_train, Y_train)

# Report the results of THIS search; the previous version read them from the
# stale `grid_search` object left over from the cell above.
best_accuracy = random_search.best_score_
best_parameters = random_search.best_params_
best_estimator = random_search.best_estimator_
cvres = random_search.cv_results_

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)
print("Best Estimator:", best_estimator)
print("#"*20)
# Scores come from the estimator's default scorer (R^2 for regressors), so
# they are printed as-is rather than passed through np.sqrt.
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)
print("#"*20)