# ML Workflow - Supervised Learning (Extra Tools)

![Image](./img/scikit_learn.png)


In [None]:
# imports 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## [Ensemble methods](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble)

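__Boosting methods:__ base estimators are built sequentially, each one trying to reduce the error of the ensemble built so far; the motivation is to combine several weak models to produce a powerful ensemble.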

- [GradientBoostingRegressor()](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)

__Averaging methods:__ the driving principle is to build several estimators independently and then to average their predictions.

- [RandomForestRegressor()](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

In [None]:
# Load datasets
# The loader is asked for a (features, target) pair, unpacked into the X and y
# globals that the following cells fit and evaluate on.

X, y = datasets.load_diabetes(return_X_y=True)
# Alternative dataset: uncomment the next line (and comment out the one above).
#X, y = datasets.fetch_california_housing(return_X_y=True)
# Sanity check: show the dimensions of both arrays.
print(X.shape, y.shape)

In [None]:
# Target analysis: box plot of the target values to inspect their spread.

ax = pd.Series(y).plot(kind='box', figsize=(5, 8))
ax.set_ylabel('Target')

# Tidy up the layout, overlay a grid and render the figure.
plt.tight_layout()
plt.grid()
plt.show()

In [None]:
%%time

# Pick one estimator; the commented-out lines are drop-in alternatives.
model = LinearRegression()
# model = GradientBoostingRegressor(random_state=42)
# model = RandomForestRegressor(random_state=42)

# Train on the whole dataset and predict on those same samples.
# fit() hands back the fitted estimator, so both calls can be chained.
y_pred = model.fit(X, y).predict(X)

# Report the estimator class followed by its hyperparameter dictionary.
hyperparameters = model.get_params()
print(model.__class__, '\n')
print('Model hyperparameters:', hyperparameters, '\n')

---

## [Cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html)

![Image](./img/cross_validation.jpeg)

In [None]:
%%time

# Estimator to evaluate; the commented-out lines are drop-in alternatives.
# model = LinearRegression()
model = GradientBoostingRegressor(random_state=42)
# model = RandomForestRegressor(random_state=42)

# Cross-validate with cv=5 and n_jobs=-1. The scorer yields the *negated*
# RMSE, one value per fold.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

print(model.__class__, '\n')
print(scores, '\n')
# Flip the sign back so the averaged RMSE is reported as a positive number.
print((-scores).mean(), '\n')

__Train, Test, Split...__

In [None]:
%%time

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Estimator to evaluate; the commented-out lines are drop-in alternatives.
# model_rmse = LinearRegression()
model_rmse = GradientBoostingRegressor(random_state=42)
# model_rmse = RandomForestRegressor(random_state=42)

# Learn from the training portion only, then predict the held-out portion.
y_pred = model_rmse.fit(X_train, y_train).predict(X_test)

hyperparameters = model_rmse.get_params()

# RMSE is the square root of the mean squared error on the held-out samples.
rmse = mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5

print(model_rmse.__class__, '\n')
print(rmse, '\n')

---

## [GridSearchCV()](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

Alternatively, you may use [RandomizedSearchCV()](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) if you have limited computing resources: it samples a fixed number of parameter combinations instead of trying every one.

In [None]:
# Gradient Boosting Regressor: candidate values for the hyperparameter search.
# NOTE: the Random Forest cell assigns the same `param_grid` name; run only the
# cell that matches the estimator being tuned.

param_grid = dict(
    n_estimators=[100, 1000],             # how many boosting stages to run
    learning_rate=[0.01, 0.1],            # shrinkage applied to each tree's contribution
    max_depth=[None, 3, 10],              # depth limit of each individual regression tree
    min_samples_split=[2, 10],            # fewest samples needed to split an internal node
    min_samples_leaf=[1, 4],              # fewest samples allowed in a leaf node
    max_features=[None, 'sqrt', 'log2'],  # features considered when searching for the best split
)

In [None]:
# Random Forest Regressor: candidate values for the hyperparameter search.
# NOTE: this rebinds the same `param_grid` name used by the Gradient Boosting
# cell; run only the cell that matches the estimator being tuned.

param_grid = dict(
    n_estimators=[100, 200, 300],         # how many trees the forest grows
    max_depth=[None, 3, 10],              # depth limit of each tree
    min_samples_split=[2, 10],            # fewest samples needed to split an internal node
    min_samples_leaf=[1, 4],              # fewest samples allowed in a leaf node
    max_features=[None, 'sqrt', 'log2'],  # features considered when searching for the best split
)

In [None]:
# Set up the search over `param_grid` for `model`. Both names refer to whatever
# an earlier cell assigned last, so make sure the grid matches the estimator.
# Candidates are ranked by negated RMSE under cv=5.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [None]:
%%time

# Run the search on the full dataset (X, y), then report the outcome.
grid_search.fit(X,y)

print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
# grid_search was built with a negated-RMSE scorer, so the sign is flipped to
# show the best score as a positive RMSE.
print('Best score: ', -grid_search.best_score_, '\n')

---