# In the notebook I will show how I iterate over multiple 
algorithms utilizing python's sklearn library. In the "Basic" section, I will iterate though each algorithm and nothing more.  Afterwards I will scale the data and then apply principle component analysis.
<br>
<br>
Inspired by DataCamp's Hyperparameter Tuning in Python Chapter 2 pages 18+

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor


from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
df1 = pd.read_csv('boston_housing.csv', usecols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX',
                                                   'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO',
                                                   'B', 'LSTAT', 'target'])

In [3]:
df1.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [4]:
X = df1.drop(columns = ['target'])
y = df1['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=243)

# Basic

In [6]:
models = {
          'RandomForestRegressor' : RandomForestRegressor(max_features='sqrt'), 
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'LinearRegression' : LinearRegression(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'SGDRegressor' : SGDRegressor(max_iter = 2000)
         }

hyperparameters = {
                   'RandomForestRegressor' : 
                                            {'max_depth': [6, 8, 10, 12], 'min_samples_leaf': [1, 2, 4, 6],
                                            'n_estimators' : [165, 175, 177, 180,]},
                   'GradientBoostingRegressor' : 
                                           {"max_depth": [2, 4, 6, 8], "min_samples_split": [3, 5, 8, 10],
                                            "learning_rate": [0.001, 0.01, 0.1, 0.15, 0.25]},
                   'LinearRegression' : {'fit_intercept' : [True, False], 'positive' : [True, False]},
    
                   'DecisionTreeRegressor' : 
                                            {'max_depth': [2, 4, 6, 8], 'min_samples_leaf': [1, 2, 4, 6],},
    
                   'SGDRegressor' : {'alpha' : [0.15, 0.25, 0.30, 0.35], 
                                     'learning_rate' :['constant', 'optimal', 'invscaling', 'adaptive'],
                                      'penalty': ['l2', 'l1', 'elasticnet', None],}
            
                  }

In [7]:
print('Algorithm, best hyperparameters, r-squared:')
i = 0
while i < len(list(models.values())):
    
    grid_rf_regression = GridSearchCV(
    estimator = list(models.values())[i],
    param_grid = list(hyperparameters.values())[i],
    #scoring='neg_mean_squared_error',
    scoring='r2',
    n_jobs=4,
    cv = 3,
    refit=True,
    return_train_score=True)
    
    grid_rf_regression.fit(X_train, y_train)
    #display(grid_rf_regression.get_params())
    
    print(list(models.keys())[i].upper(),':',grid_rf_regression.best_params_,grid_rf_regression.best_score_)
    
    i += 1

Algorithm, best hyperparameters, r-squared:
RANDOMFORESTREGRESSOR : {'max_depth': 12, 'min_samples_leaf': 1, 'n_estimators': 180} 0.8247038461819008
GRADIENTBOOSTINGREGRESSOR : {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_split': 3} 0.8431710173025984
LINEARREGRESSION : {'fit_intercept': True, 'positive': False} 0.6864013112944148
DECISIONTREEREGRESSOR : {'max_depth': 8, 'min_samples_leaf': 2} 0.7014689877766709
SGDREGRESSOR : {'alpha': 0.3, 'learning_rate': 'adaptive', 'penalty': 'l2'} -4.5017376544305714e+21


In [8]:
#display(DecisionTreeRegressor().get_params().keys())
#display(DecisionTreeRegressor().get_params())
#display(grid_rf_regression.get_params())

# Scaling

In [9]:
scaler = StandardScaler()
models = {
          'RandomForestRegressor' : Pipeline([('scaler', scaler),
                                              ('RandomForestRegressor', RandomForestRegressor(max_features='sqrt'))],),
          'GradientBoostingRegressor' : Pipeline([('scaler', scaler),
                                                  ('GradientBoostingRegressor', GradientBoostingRegressor())]),
          'LinearRegression' : Pipeline([('scaler', scaler),
                                         ('LinearRegression', LinearRegression())]),
          'DecisionTreeRegressor' : Pipeline([('scaler', scaler),
                                              ('DecisionTreeRegressor', DecisionTreeRegressor())]),
          'SGDRegressor' :  Pipeline([('scaler', scaler),
                                      ('SGDRegressor', SGDRegressor())])
          }

hyperparameters = {
                   'RandomForestRegressor' : 
                                            {'RandomForestRegressor__max_depth': [6, 8, 10, 12],
                                             'RandomForestRegressor__min_samples_leaf': [1, 2, 4, 6],
                                            'RandomForestRegressor__n_estimators' : [165, 175, 177, 180,]},
                   'GradientBoostingRegressor' : 
                                           {"GradientBoostingRegressor__max_depth": [2, 4, 6, 8],
                                            "GradientBoostingRegressor__min_samples_split": [3, 5, 8, 10],
                                            "GradientBoostingRegressor__learning_rate": [0.001, 0.01, 0.1, 0.15, 0.25]},
                   'LinearRegression' : 
                                        {'LinearRegression__fit_intercept' : [True, False],
                                          'LinearRegression__positive' : [True, False]},
                   'DecisionTreeRegressor' : 
                                            {'DecisionTreeRegressor__max_depth': [2, 4, 6, 8],
                                             'DecisionTreeRegressor__min_samples_leaf': [1, 2, 4, 6],},
                   'SGDRegressor' : 
                                    {'SGDRegressor__alpha' : [0.15, 0.25, 0.30, 0.35], 
                                     'SGDRegressor__learning_rate' :['constant', 'optimal', 'invscaling', 'adaptive'],
                                      'SGDRegressor__penalty': ['l2', 'l1', 'elasticnet', None],}
                    }

In [10]:
print('Algorithm, best hyperparameters, r-squared:')
i = 0
while i < len(list(models.values())):
    
    grid_rf_regression = GridSearchCV(
    estimator = list(models.values())[i],
    param_grid = list(hyperparameters.values())[i],
    #scoring='neg_mean_squared_error',
    scoring='r2',
    n_jobs=4,
    cv = 3,
    refit=True,
    return_train_score=True)
    
    grid_rf_regression.fit(X_train, y_train)
    #display(grid_rf_regression.get_params())
    
    print(list(models.keys())[i].upper(),':',grid_rf_regression.best_params_,grid_rf_regression.best_score_)
    print('  ')
    i += 1

Algorithm, best hyperparameters, r-squared:
RANDOMFORESTREGRESSOR : {'RandomForestRegressor__max_depth': 8, 'RandomForestRegressor__min_samples_leaf': 1, 'RandomForestRegressor__n_estimators': 175} 0.826227448699833
  
GRADIENTBOOSTINGREGRESSOR : {'GradientBoostingRegressor__learning_rate': 0.1, 'GradientBoostingRegressor__max_depth': 4, 'GradientBoostingRegressor__min_samples_split': 8} 0.843835347505636
  
LINEARREGRESSION : {'LinearRegression__fit_intercept': True, 'LinearRegression__positive': False} 0.6864013112944152
  
DECISIONTREEREGRESSOR : {'DecisionTreeRegressor__max_depth': 8, 'DecisionTreeRegressor__min_samples_leaf': 1} 0.7417983886289493
  
SGDREGRESSOR : {'SGDRegressor__alpha': 0.35, 'SGDRegressor__learning_rate': 'adaptive', 'SGDRegressor__penalty': None} 0.6877920559477668
  


# Scaler and Principle Component Analysis (PCA)

In [11]:
scaler = StandardScaler()
pca = PCA()

models = {
          'RandomForestRegressor' : Pipeline([('scaler', scaler),
                                              ('pca', pca),
                                              ('RandomForestRegressor', RandomForestRegressor(max_features='sqrt'))],),
          'GradientBoostingRegressor' : Pipeline([('scaler', scaler),
                                                  ('pca', pca),
                                                  ('GradientBoostingRegressor', GradientBoostingRegressor())]),
          'LinearRegression' : Pipeline([('scaler', scaler),
                                         ('pca', pca),
                                         ('LinearRegression', LinearRegression())]),
          'DecisionTreeRegressor' : Pipeline([('scaler', scaler),
                                              ('pca', pca),
                                              ('DecisionTreeRegressor', DecisionTreeRegressor())]),
          'SGDRegressor' :  Pipeline([('scaler', scaler),
                                      ('pca', pca),
                                      ('SGDRegressor', SGDRegressor())]),
          }

hyperparameters = {
                   'RandomForestRegressor' : 
                                            {
                                             'pca__n_components' : np.arange(1, len(X.columns) + 1, 1),
                                             'RandomForestRegressor__max_depth': [6, 8, 10, 12],
                                             'RandomForestRegressor__min_samples_leaf': [1, 2],
                                             'RandomForestRegressor__n_estimators' : [165, 175, ],
                                            },
                   'GradientBoostingRegressor' : 
                                           {'pca__n_components' : np.arange(1, len(X.columns) + 1, 1),
                                            'GradientBoostingRegressor__max_depth': [2, 4, 6, 8],
                                            'GradientBoostingRegressor__min_samples_split': [3, 5, 8, 10],
                                            'GradientBoostingRegressor__learning_rate': [ 0.01, 0.1, 0.15,],
                                           },
                   'LinearRegression' : 
                                        {
                                          'pca__n_components' : np.arange(1, len(X.columns) + 1, 1),
                                          'LinearRegression__fit_intercept' : [True, False],
                                          'LinearRegression__positive' : [True, False],
                                        },
                   'DecisionTreeRegressor' : 
                                            {
                                             'pca__n_components' : np.arange(1, len(X.columns) + 1, 1),
                                             'DecisionTreeRegressor__max_depth': [2, 4, 6, 8],
                                             'DecisionTreeRegressor__min_samples_leaf': [1, 2, 4, 6],
                                            },
                   'SGDRegressor' : 
                                    {
                                     'pca__n_components' : np.arange(1, len(X.columns) + 1, 1),
                                     'SGDRegressor__alpha' : [0.15, 0.25, 0.30, 0.35], 
                                     'SGDRegressor__learning_rate' :['constant', 'optimal', 'invscaling', 'adaptive'],
                                     'SGDRegressor__penalty': ['l2', 'l1', 'elasticnet', None],
                                    }
                    }

In [12]:
print('Algorithm, best hyperparameters, r-squared:')
i = 0
while i < len(list(models.values())):
    
    grid_rf_regression = GridSearchCV(
    estimator = list(models.values())[i],
    param_grid = list(hyperparameters.values())[i],
    #scoring='neg_mean_squared_error',
    scoring='r2',
    n_jobs=4,
    cv = 3,
    refit=True,
    return_train_score=True)
    
    grid_rf_regression.fit(X_train, y_train)
    #display(grid_rf_regression.get_params())
    
    print(list(models.keys())[i].upper(),':',grid_rf_regression.best_params_,grid_rf_regression.best_score_)
    print('  ')
    i += 1

Algorithm, best hyperparameters, r-squared:
RANDOMFORESTREGRESSOR : {'RandomForestRegressor__max_depth': 12, 'RandomForestRegressor__min_samples_leaf': 1, 'RandomForestRegressor__n_estimators': 165, 'pca__n_components': 12} 0.7396813003224555
  
GRADIENTBOOSTINGREGRESSOR : {'GradientBoostingRegressor__learning_rate': 0.1, 'GradientBoostingRegressor__max_depth': 4, 'GradientBoostingRegressor__min_samples_split': 10, 'pca__n_components': 12} 0.7943943739600443
  
LINEARREGRESSION : {'LinearRegression__fit_intercept': True, 'LinearRegression__positive': False, 'pca__n_components': 13} 0.6864013112944151
  
DECISIONTREEREGRESSOR : {'DecisionTreeRegressor__max_depth': 8, 'DecisionTreeRegressor__min_samples_leaf': 2, 'pca__n_components': 7} 0.5889123570638041
  
SGDREGRESSOR : {'SGDRegressor__alpha': 0.3, 'SGDRegressor__learning_rate': 'adaptive', 'SGDRegressor__penalty': None, 'pca__n_components': 13} 0.6874868823240415
  


# The tree based estimators - xgbooster, (not in this notebook) decision tree, and random forest - often do not benefit from scaling and pca, which I why I prefer them.  Nonetheless, if you wish you use an algorithm that requires scaling or pca, you can use the code in this notebook as a guide.<br> Below is an example of how I might instigate a well performing above. 

In [13]:
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=3)),
                 ('regressor', RandomForestRegressor())])

In [14]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('reducer', PCA(n_components=3)),
                ('regressor', RandomForestRegressor())])

In [15]:
print(pipe.score(X_test, y_test))

0.7161943144071752
