In [256]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.linear_model import LinearRegression, ElasticNet, ElasticNetCV
from sklearn.metrics import SCORERS, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.graph_objects as go
from catboost import CatBoostRegressor, Pool

import numpy as np
from math import sqrt
pd.set_option("display.max_columns", 999)

# ML Regression techniques

## Regression problem

#### History :)
The term "regression" was coined by Francis Galton in the nineteenth century to describe a biological phenomenon. The phenomenon was that the heights of descendants of tall ancestors tend to regress down towards a normal average (a phenomenon also known as regression toward the mean).

#### Definition
Regression predictive modeling is the task of approximating a mapping function from input variables to a numerical output variable.

#### Most common regression models
* Linear Regression
    * Lasso
    * Ridge
    * ElasticNet
* Generalized Linear Regression
    * Logistic
    * Poisson
* Decision Trees
* Ensemble methods
    * Random Forests
    * Gradient Boosting
* Neural Networks
* Bayesian Regression
* SVM

# High Variance vs. High Bias problem

## High Bias
Bias is the algorithm’s tendency to consistently learn the wrong thing by not taking into account all the information in the data (underfitting)

#### Proposals
* Try a more complex model.
* Add features.

## High Variance
Variance is the algorithm’s tendency to learn random things irrespective of the real signal by fitting highly flexible models that follow the error/noise in the data too closely (overfitting).

#### Proposals
* Try a simpler model.
* Remove / reduce some features.
* Add regularization parameters.


<img src="img/Bias-Variance-Tradeoff-In-Machine-Learning.png" style="height:600px">

## ElasticNet Model a.k.a. From LM to ML

### Motivation
When we have lots of features, we want to be able to penalize their size and number and thus balance the Bias/Variance problem. 

Ideally we want to do this by some neat parametrization.


### Regularized loss function !

$L(\hat{y}, y)_{reg} = L(\hat{y}, y) + \lambda * R(\beta)$.

##### The loss function for ElasticNet regression is
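$L(\hat{y}, y)_{enet} = \sum_{i}{(\hat{y}_i-y_i)^2} + \lambda \big[\alpha \sum_{p}{|\beta_p|} + \frac{1-\alpha}{2} \sum_{p}{\beta_p^2}\big]$, where $\lambda \geq 0$ and $\alpha \in [0, 1]$ ($\alpha = 1$ gives Lasso, $\alpha = 0$ gives Ridge). In scikit-learn's `ElasticNet`, $\lambda$ is the `alpha` argument and $\alpha$ is `l1_ratio`.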

<img src="img/lasso_vs_ridge.png" />

## But how is $\lambda$ set? => CrossValidation!

<img src="img/cv_mse.png" />

# Back to the code

In [12]:
# root-mean-squared-error helper
def rmse(x, y):
    """Return the square root of ``mean_squared_error(x, y)``.

    Both arguments are forwarded unchanged to ``mean_squared_error``.
    """
    squared_error = mean_squared_error(x, y)
    return sqrt(squared_error)

In [72]:
# read the attrition dataset and discard the columns that are not used in the
# modelling below
df = pd.read_csv('data/attrition.csv').drop(
    columns=['EmployeeNumber', 'Attrition', 'Over18', 'StandardHours', 'EmployeeCount']
)

In [73]:
# which columns do we have left after the drop
df.columns

Index(['Age', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender',
       'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [74]:
# columns to inspect in a scatter-matrix plot
inspect_cols = [
    'MonthlyIncome',
    'EducationField',
    'JobLevel',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# the plot itself is disabled; uncomment the lines below to render it
# fig = px.scatter_matrix(df[inspect_cols])
# # increase the resolution
# fig.update_layout(
#     autosize=False,
#     width=2*1024,
#     height=2*720,
#     paper_bgcolor="LightSteelBlue",
# )
# #plot the scatter matrix plot
# fig.show()

In [75]:
# make the task harder: remove JobLevel from the data
# (presumably a strong predictor of income - TODO confirm)
# errors='ignore' keeps this cell re-runnable: the original in-place drop raised
# a KeyError when executed a second time because the column was already gone
df = df.drop(columns='JobLevel', errors='ignore')

In [76]:
# response variable
target_col_name = 'MonthlyIncome'

# features treated as numeric
num_feature_cols = [
    'Age', 'DailyRate', 'DistanceFromHome', 'Education',
    'HourlyRate', 'EnvironmentSatisfaction', 'JobInvolvement',
    'JobSatisfaction', 'NumCompaniesWorked', 'PercentSalaryHike',
    'RelationshipSatisfaction', 'StockOptionLevel', 'PerformanceRating',
    'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
    'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
    'YearsWithCurrManager', 'MonthlyRate',
]

# every remaining column (neither numeric feature nor target) is treated as
# categorical; the order of df.columns is kept
non_categorical = set(num_feature_cols) | {target_col_name}
cat_feature_cols = [name for name in df.columns if name not in non_categorical]

In [77]:
# convert every numeric feature column to float in a single astype call
df = df.astype(dict.fromkeys(num_feature_cols, float))

In [78]:
# numeric feature frame and the target as a flat 1-D array
df_features = df[num_feature_cols]
df_target = df[target_col_name].values

In [30]:
# ALTERNATIVE REALITY (optional): swap the attrition data for the Boston housing data.
# Disabled by default: run unconditionally, this cell overwrote df_target and
# df_features, so under "Restart & Run All" every model below was fitted on the
# Boston data, while the outputs saved in this notebook (21 coefficients keyed by
# attrition column names) come from the attrition data.
USE_BOSTON_DATA = False

if USE_BOSTON_DATA:
    # NOTE(review): load_boston was removed in scikit-learn 1.2 - this branch
    # needs an older release
    from sklearn.datasets import load_boston
    boston = load_boston()
    data = pd.DataFrame(boston.data, columns=boston.feature_names)
    data = pd.concat([data, pd.Series(boston.target, name='MEDV')], axis=1)

    # last column is the target, everything before it are the features
    df_target = np.ravel(data.iloc[:, -1])
    df_features = data.iloc[:, :-1]

In [80]:
# hold out 30 % of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df_target,
    test_size=0.3,
    random_state=666,
)

In [87]:
# baseline: ordinary least squares on the numeric features
m0 = LinearRegression()
m0.fit(X_train, y_train)

# evaluate on the held-out part
m0_r2 = m0.score(X_test, y_test)
m0_rmse = rmse(m0.predict(X_test), y_test)

print(f"Model rmse: {m0_rmse} \n")
print(f"Model R2: {m0_r2} \n")
print(f"Model coefs: {list(map(int, m0.coef_))}")

Model rmse: 2969.246488009331 

Model R2: 0.5913292923231421 

Model coefs: [-22, 0, -14, -48, 0, -55, 29, 76, -9, 47, 94, 19, -844, 470, 15, 192, 45, 25, 53, -121, 0]


In [96]:
# elastic net with a hand-picked penalty strength and L1/L2 mix
m1 = ElasticNet(alpha=1, l1_ratio=0.5)
m1.fit(X_train, y_train)

# evaluate on the held-out part and remember the hyper-parameters used
m1_r2 = m1.score(X_test, y_test)
m1_rmse = rmse(m1.predict(X_test), y_test)
hyper_pars = dict(alpha=m1.alpha, l1_ratio=m1.l1_ratio)


print(f"Model rmse: {m1_rmse} \n")
print(f"Model R2: {m1_r2} \n")
print(f"Model coefs: {list(map(int, m1.coef_))} \n")
print(f"Model hyper_params: {hyper_pars}")

Model rmse: 2955.164168268846 

Model R2: 0.5951965255292877 

Model coefs: [-17, 0, -14, -27, -1, -36, 19, 56, -3, -11, 65, 11, -79, 456, 12, 89, 51, 21, 50, -109, 0] 

Model hyper_params: {'alpha': 1, 'l1_ratio': 0.5}


# Cross validation - hyperparameter tuning

### K-Fold CV

<img src="img/kfold.png" />

### GridSearch CV Workflow

<img src="img/grid_search_workflow.png" />

In [114]:
# grid search over the l1_ratio and alpha hyper-parameters (5-fold CV, scored by R2)
param_grid = {'l1_ratio': [.0001, .05, .1, .5, .7, .9, .95, 1], 'alpha':[.05, .1, .2, .5, 1, 5, 10, 20, 50, 100, 500]}
# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 - drop it when upgrading
m2 = GridSearchCV(estimator=ElasticNet(), param_grid=param_grid, cv=5, scoring='r2', iid=False).fit(X_train, y_train)

# predict once and reuse the result for both error metrics
m2_pred = m2.predict(X_test)
m2_mse = rmse(m2_pred, y_test)  # holds the RMSE despite the variable name
m2_mae = mean_absolute_error(m2_pred, y_test)
m2_r2 = m2.score(X_test, y_test)


# the value is a root-mean-squared error, so label it "rmse" like the cells above
print(f"Model rmse: {m2_mse} \n")
print(f"Model mae: {m2_mae} \n")
print(f"Model R2: {m2_r2} \n")
print(f"Model coefs: {[round(x,2) for x in m2.best_estimator_.coef_]} \n")
print(f"Model hyper_params: {m2.best_params_}")

Model mse: 2957.2425960481037 

Model mae: 2215.8946136811255 

Model R2: 0.594626912059697 

Model coefs: [-1.73, -0.08, -7.47, -0.0, -0.53, -0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, 446.42, 0.0, 0.0, 21.03, 0.0, 0.0, -0.0, 0.0] 

Model hyper_params: {'alpha': 500, 'l1_ratio': 1}


In [115]:
# pair every training column with the coefficient of the best grid-search model
{name: coef for name, coef in zip(X_train.columns, m2.best_estimator_.coef_)}

{'Age': -1.73180901312059,
 'DailyRate': -0.08089662608033568,
 'DistanceFromHome': -7.469024425323716,
 'Education': -0.0,
 'HourlyRate': -0.5339916377695538,
 'EnvironmentSatisfaction': -0.0,
 'JobInvolvement': 0.0,
 'JobSatisfaction': 0.0,
 'NumCompaniesWorked': -0.0,
 'PercentSalaryHike': -0.0,
 'RelationshipSatisfaction': 0.0,
 'StockOptionLevel': 0.0,
 'PerformanceRating': -0.0,
 'TotalWorkingYears': 446.42471528737127,
 'TrainingTimesLastYear': 0.0,
 'WorkLifeBalance': 0.0,
 'YearsAtCompany': 21.028429421179982,
 'YearsInCurrentRole': 0.0,
 'YearsSinceLastPromotion': 0.0,
 'YearsWithCurrManager': -0.0,
 'MonthlyRate': 0.0037122224639382887}

In [116]:
# built-in cross-validated elastic net (searches alpha itself for each l1_ratio)
# NOTE: this rebinds m2, which previously held the GridSearchCV result; the
# coefficient-listing cell above reads m2.best_estimator_ and must run before this one
m2 = ElasticNetCV(cv=5, l1_ratio=[.1, .5, .7, .9, .95, .99, 1]).fit(X_train, y_train)

hyper_params = {'alpha': m2.alpha_, 'l1_ratio': m2.l1_ratio_}
m2_mse = rmse(m2.predict(X_test), y_test)  # holds the RMSE despite the variable name
m2_r2 = m2.score(X_test, y_test)

# the value is a root-mean-squared error, so label it "rmse"
print(f"Model rmse: {m2_mse} \n")
print(f"Model R2: {m2_r2} \n")
print(f"Model coefs: {[round(x,2) for x in m2.coef_]} \n")
print(f"Model hyper_params: {hyper_params}")

Model mse: 2963.4902447293634 

Model R2: 0.5929122715810862 

Model coefs: [-0.0, -0.08, -0.88, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, 442.35, 0.0, 0.0, 12.85, 0.0, 0.0, -0.0, 0.0] 

Model hyper_params: {'alpha': 942.4287461377609, 'l1_ratio': 1.0}


## Transformers & Pipelines

### a.k.a. how to do some feature engineering and not go crazy through CV

<img src="img/pipeline-diagram.png" />

**Note: The pipeline object behaves just like any other estimator object. Which is cool.**

#### OK, let's add some categorical features!

In [307]:
# feature frame with numeric and categorical columns, plus the target as a flat array
df_features_all = df[num_feature_cols + cat_feature_cols]
df_target = df[target_col_name].values

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df_features_all,
    df_target,
    test_size=0.3,
    random_state=666,
)

In [308]:
# one-hot encode the categorical columns, pass the remaining columns through
# untouched, then fit ordinary least squares on the result
column_transformer = ColumnTransformer([('onehot', OneHotEncoder(), cat_feature_cols)], remainder='passthrough')
estimators = [('col_trans', column_transformer), ('reg', LinearRegression())]

pipe0 = Pipeline(estimators)
pipe0.fit(X_train, y_train)

# predict once and reuse the result for both error metrics
pipe0_pred = pipe0.predict(X_test)
pipe0_mse = rmse(pipe0_pred, y_test)  # holds the RMSE despite the variable name
pipe0_mae = mean_absolute_error(pipe0_pred, y_test)
pipe0_r2 = pipe0.score(X_test, y_test)

# the value is a root-mean-squared error, so label it "rmse"
print(f"Model rmse: {pipe0_mse} \n")
print(f"Model mae: {pipe0_mae} \n")
print(f"Model R2: {pipe0_r2} \n")
print(f"Model coefs: {[int(x) for x in pipe0.named_steps['reg'].coef_]} \n")

Model mse: 1704.311210211338 

Model mae: 1352.1166486490117 

Model R2: 0.8653585412927676 

Model coefs: [-61, 25, 35, 32, 431, -463, 527, -76, -37, -227, -129, -57, -60, 60, -677, -2508, -3640, 7337, -467, 6367, -3730, 254, -2935, -133, 35, 97, -25, 25, -13, 0, -1, -71, -1, 19, -88, 49, 18, 37, 68, 40, -534, 194, -21, 20, 37, -28, 59, -68, 0] 



In [309]:
# same one-hot pipeline, now with an ElasticNet left at its default hyper-parameters
column_trans = ColumnTransformer([('onehot', OneHotEncoder(dtype='int'), cat_feature_cols)], remainder='passthrough')
estimators = [('column_trans', column_trans), ('reg', ElasticNet())]

pipe0 = Pipeline(estimators)
pipe0.fit(X_train, y_train)

# predict once and reuse the result for both error metrics
pipe0_pred = pipe0.predict(X_test)
pipe0_mse = rmse(pipe0_pred, y_test)  # holds the RMSE despite the variable name
pipe0_mae = mean_absolute_error(pipe0_pred, y_test)
pipe0_r2 = pipe0.score(X_test, y_test)
hyper_pars = {'alpha': pipe0.named_steps['reg'].alpha, 'l1_ratio': pipe0.named_steps['reg'].l1_ratio}

# the value is a root-mean-squared error, so label it "rmse"
print(f"Model rmse: {pipe0_mse} \n")
print(f"Model mae: {pipe0_mae} \n")
print(f"Model R2: {pipe0_r2} \n")
print(f"Model coefs: {[int(x) for x in pipe0.named_steps['reg'].coef_]} \n")
print(f"Model hyper_params: {hyper_pars}")

Model mse: 2750.6389022310195 

Model mae: 2027.2521952746026 

Model R2: 0.6492899843206599 

Model coefs: [29, -75, 45, 0, -82, 83, 39, 0, 37, -31, -10, -35, -4, 4, -55, -65, -430, 561, 40, 501, -486, 69, -134, 15, 26, -42, 19, -19, -17, 0, -13, -30, -1, -28, 24, 58, -1, -9, 66, 0, -73, 434, 6, 70, 51, 11, 49, -107, 0] 

Model hyper_params: {'alpha': 1.0, 'l1_ratio': 0.5}


In [306]:
# add imputers to the preprocessing and tune the ElasticNet with a grid search
# numeric columns: fill missing values with the column median
num_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='median'))])

# categorical columns: fill missing values with the most frequent level, then
# one-hot encode; levels unseen during fit are ignored at predict time
cat_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='most_frequent')),
                  ('onehot', OneHotEncoder(categories='auto', 
                                     sparse=False, 
                                     handle_unknown='ignore'))])

pipeline_preprocess = ColumnTransformer(transformers=[
        ('numerical_preprocessing', num_transformer, num_feature_cols),
        ('categorical_preprocessing', cat_transformer, cat_feature_cols)],
        remainder='passthrough')

pipe0 = Pipeline([("transform_inputs", pipeline_preprocess), ("reg", ElasticNet())])


# the 'reg__' prefix routes each parameter to the 'reg' step of the pipeline
param_grid = {'reg__l1_ratio': [.1, .5, .7, .9, .95, 1], 'reg__alpha':[0.2, 0.5, 1, 4, 10, 20, 40, 100, 500, 800]}
# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 - drop it when upgrading
m3 = GridSearchCV(estimator=pipe0, param_grid=param_grid, cv=5, scoring='r2', iid=False).fit(X_train, y_train)

# predict once and reuse the result for both error metrics
m3_pred = m3.predict(X_test)
m3_mse = rmse(m3_pred, y_test)  # holds the RMSE despite the variable name
m3_mae = mean_absolute_error(m3_pred, y_test)
m3_r2 = m3.best_estimator_.score(X_test, y_test)

# the value is a root-mean-squared error, so label it "rmse"
print(f"Model rmse: {m3_mse} \n")
print(f"Model mae: {m3_mae} \n")
print(f"Model R2: {m3_r2} \n")
print(f"Model coefs: {[int(x) for x in m3.best_estimator_.named_steps['reg'].coef_]} \n")
print(f"Model hyper_params: {m3.best_params_}")

Model mse: 1676.9665920128036 

Model mae: 1310.1494694430319 

Model R2: 0.8754858756260295 

Model coefs: [-6, 0, 4, -19, 0, -24, -97, 37, 0, 12, 21, 9, -406, 213, -39, 82, 44, -30, 35, -45, 0, 0, 0, 0, -98, 246, 0, 0, 0, 54, -85, 0, 0, -3, 0, 0, -1444, -2940, 7005, 0, 6920, -2910, 61, -2508, 0, -55, 0, 0, 0] 

Model hyper_params: {'reg__alpha': 10, 'reg__l1_ratio': 1}


In [301]:
# same pipeline as before, with a scaler added for the numeric columns
# numeric columns: median imputation followed by RobustScaler
num_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='median')),
                  ('scaler', RobustScaler())])

# categorical columns: most-frequent imputation followed by one-hot encoding;
# levels unseen during fit are ignored at predict time
cat_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='most_frequent')),
                  ('onehot', OneHotEncoder(categories='auto', 
                                     sparse=False, 
                                     handle_unknown='ignore'))])

pipeline_preprocess = ColumnTransformer(transformers=[
        ('numerical_preprocessing', num_transformer, num_feature_cols),
        ('categorical_preprocessing', cat_transformer, cat_feature_cols)],
        remainder='passthrough')

pipe0 = Pipeline([("transform_inputs", pipeline_preprocess), ("reg", ElasticNet())])


# the 'reg__' prefix routes each parameter to the 'reg' step of the pipeline
param_grid = {'reg__l1_ratio': [.1, .5, .7, .9, .95, 1], 'reg__alpha':[0.2, 0.5, 1, 4, 10, 20, 40, 100]}
# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 - drop it when upgrading
m4 = GridSearchCV(estimator=pipe0, param_grid=param_grid, cv=5, scoring='r2', iid=False).fit(X_train, y_train)

# predict once and reuse the result for both error metrics
m4_pred = m4.predict(X_test)
m4_mse = rmse(m4_pred, y_test)  # holds the RMSE despite the variable name
m4_mae = mean_absolute_error(m4_pred, y_test)
m4_r2 = m4.best_estimator_.score(X_test, y_test)

# the value is a root-mean-squared error, so label it "rmse"
print(f"Model rmse: {m4_mse} \n")
print(f"Model mae: {m4_mae} \n")
print(f"Model R2: {m4_r2} \n")
print(f"Model coefs: {[int(x) for x in m4.best_estimator_.named_steps['reg'].coef_]} \n")
print(f"Model hyper_params: {m4.best_params_}")

Model mse: 1680.9220482471592 

Model mae: 1326.175268989353 

Model R2: 0.874897800689984 

Model coefs: [0, 0, 7, 0, 0, 0, -89, 36, 0, 0, 0, 0, -242, 1929, -34, 54, 149, -40, 61, -107, 0, 0, 0, 0, 0, 106, 0, 0, 0, 0, -21, 0, 0, 0, 0, 0, -1274, -2777, 6806, 0, 6776, -2746, 0, -2273, 0, -16, 0, 0, 0] 

Model hyper_params: {'reg__alpha': 20, 'reg__l1_ratio': 1}


## Decision trees - bricks of ensemble models

#### What is it?
A model that consists of k nodes, each with a binary decision rule.

#### How does it predict?
1] Is your weight >= 90 -> True

2] Is your level of exercise >= 0 -> False

3] Do you have a stressful job -> True

--> You will die at np.mean([63, 82, 54, 61, 64]) = 64.8

#### How is it trained?

1] For each candidate split $\theta$ (across all features) at a node holding the data $Q$ with $N_m$ data points, the scoring function $G(Q, \theta)$ is computed as:

$G(Q, \theta) = \frac{n_{left}}{N_m} H(Q_{left}(\theta)) + \frac{n_{right}}{N_m} H(Q_{right}(\theta))$ 

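Where $Q_{left}(\theta)$ and $Q_{right}(\theta)$ are the subsets of $Q$ sent to the left and right child by the split, and $n_{left}$, $n_{right}$ are their sizes. The impurity function $H()$ is defined as the MSE for a regression task.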

2] You select $\theta^* = argmin_{\theta} G(Q, \theta)$ as your next split.

This is repeated until there is nothing to split or stopping criterion is hit.

#### Why is it so bad?
The decision trees tend to overfit.

<img src="img/tree_fit.png"/>

In [300]:
# decision tree regressor behind the same preprocessing, tuned over max_depth
# numeric columns: median imputation followed by RobustScaler
num_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='median')),
                  ('scaler', RobustScaler())])

# categorical columns: most-frequent imputation followed by one-hot encoding;
# levels unseen during fit are ignored at predict time
cat_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='most_frequent')),
                  ('onehot', OneHotEncoder(categories='auto', 
                                     sparse=False, 
                                     handle_unknown='ignore'))])

pipeline_preprocess = ColumnTransformer(transformers=[
        ('numerical_preprocessing', num_transformer, num_feature_cols),
        ('categorical_preprocessing', cat_transformer, cat_feature_cols)],
        remainder='passthrough')

# fixed seed (same value as the train/test split) so the fitted tree is repeatable
pipe1 = Pipeline([("transform_inputs", pipeline_preprocess), ("reg", DecisionTreeRegressor(random_state=666))])

param_grid = {'reg__max_depth': [3, 5]}
# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 - drop it when upgrading
m5 = GridSearchCV(estimator=pipe1, param_grid=param_grid, cv=5, scoring='r2', iid=False).fit(X_train, y_train)

# predict once and reuse the result for both error metrics
m5_pred = m5.predict(X_test)
m5_mse = rmse(m5_pred, y_test)  # holds the RMSE despite the variable name
m5_mae = mean_absolute_error(m5_pred, y_test)
m5_r2 = m5.best_estimator_.score(X_test, y_test)

# the value is a root-mean-squared error, so label it "rmse"
print(f"Model rmse: {m5_mse} \n")
print(f"Model mae: {m5_mae} \n")
print(f"Model R2: {m5_r2} \n")
print(f"Model hyper_params: {m5.best_params_}")

Model mse: 1780.9897028884568 

Model mae: 1333.0034782136986 

Model R2: 0.8595594200487755 

Model hyper_params: {'reg__max_depth': 5}


In [131]:
# render the fitted decision tree to img/tree_example.pdf
import graphviz 
# m5 is the grid search over the DecisionTreeRegressor pipeline; the original
# passed m4, whose 'reg' step is an ElasticNet and cannot be exported as a tree
dot_data = export_graphviz(m5.best_estimator_.named_steps['reg'], out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("img/tree_example")

'img/tree_example.pdf'

<img src="img/tree_example.pdf"/>

## Ensemble models - Random Forests

<img src="img/rf.png"/>

In [299]:
# random forest regressor behind the same preprocessing, tuned with a grid search
# numeric columns: median imputation followed by RobustScaler
num_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='median')),
                  ('scaler', RobustScaler())])

# categorical columns: most-frequent imputation followed by one-hot encoding;
# levels unseen during fit are ignored at predict time
cat_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='most_frequent')),
                  ('onehot', OneHotEncoder(categories='auto', 
                                     sparse=False, 
                                     handle_unknown='ignore'))])

pipeline_preprocess = ColumnTransformer(transformers=[
        ('numerical_preprocessing', num_transformer, num_feature_cols),
        ('categorical_preprocessing', cat_transformer, cat_feature_cols)],
        remainder='passthrough')

# fixed seed (same value as the train/test split): the forest is stochastic, so
# without it the scores and the selected hyper-parameters change between runs
pipe2 = Pipeline([("transform_inputs", pipeline_preprocess), ("reg", RandomForestRegressor(random_state=666))])

param_grid = {'reg__max_depth': [5, 10, 15], 'reg__n_estimators':[40, 100, 150], 'reg__max_features':['auto', 'sqrt']}
# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 - drop it when upgrading
m6 = GridSearchCV(estimator=pipe2, param_grid=param_grid, cv=5, scoring='r2', iid=False).fit(X_train, y_train)

# predict once and reuse the result for both error metrics
m6_pred = m6.predict(X_test)
m6_mse = rmse(m6_pred, y_test)  # holds the RMSE despite the variable name
m6_mae = mean_absolute_error(m6_pred, y_test)
m6_r2 = m6.best_estimator_.score(X_test, y_test)

# the value is a root-mean-squared error, so label it "rmse"
print(f"Model rmse: {m6_mse} \n")
print(f"Model mae: {m6_mae} \n")
print(f"Model R2: {m6_r2} \n")
print(f"Model hyper_params: {m6.best_params_}")

Model mse: 1559.5982142514106 

Model mae: 1204.6771158761585 

Model R2: 0.892305070653728 

Model hyper_params: {'reg__max_depth': 10, 'reg__max_features': 'auto', 'reg__n_estimators': 100}


## Ensemble models - Gradient Boosting

#### Weak learner -> errors -> Weak learner -> errors -> ...

If we define weak learner in $m$-th iteration step as $h_m$ and final model in step $m$ as $F_m$, then the iteration formula is:

$F_m(X) = F_{m-1}(X) + \lambda h_m(X)$,

where $X$ are the data and $\lambda$ is the learning rate coefficient. The initial model $F_0$ is just a weak learner fitted on the data $X$.

<img src="img/xgb.png"/>

In [259]:
# gradient boosting regressor behind the same preprocessing, tuned with a grid search
# numeric columns: median imputation followed by RobustScaler
num_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='median')),
                  ('scaler', RobustScaler())])

# categorical columns: most-frequent imputation followed by one-hot encoding;
# levels unseen during fit are ignored at predict time
cat_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='most_frequent')),
                  ('onehot', OneHotEncoder(categories='auto', 
                                     sparse=False, 
                                     handle_unknown='ignore'))])

pipeline_preprocess = ColumnTransformer(transformers=[
        ('numerical_preprocessing', num_transformer, num_feature_cols),
        ('categorical_preprocessing', cat_transformer, cat_feature_cols)],
        remainder='passthrough')

# fixed seed (same value as the train/test split): with subsample < 1 the
# boosting is stochastic, so without it the results change between runs
pipe3 = Pipeline([("transform_inputs", pipeline_preprocess), ("reg", GradientBoostingRegressor(random_state=666))])

param_grid = {'reg__max_features': ['auto', 'sqrt'], 'reg__subsample': [0.1, 0.05, 0.4], 'reg__min_samples_leaf': [0.0025, 0.005, 0.01, 0.05, 0.1], 'reg__n_estimators':[30, 40, 50, 70, 200]}
# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 - drop it when upgrading
m7 = GridSearchCV(estimator=pipe3, param_grid=param_grid, cv=5, scoring='r2', iid=False).fit(X_train, y_train)

# predict once and reuse the result for both error metrics
m7_pred = m7.predict(X_test)
m7_mse = rmse(m7_pred, y_test)  # holds the RMSE despite the variable name
m7_mae = mean_absolute_error(m7_pred, y_test)
m7_r2 = m7.best_estimator_.score(X_test, y_test)

# the value is a root-mean-squared error, so label it "rmse"
print(f"Model rmse: {m7_mse} \n")
print(f"Model mae: {m7_mae} \n")
print(f"Model R2: {m7_r2} \n")
print(f"Model hyper_params: {m7.best_params_}")

Model mse: 1594.477921602041 

Model mae: 1238.4074915812876 

Model R2: 0.8874341079739063 

Model hyper_params: {'reg__max_features': 'auto', 'reg__min_samples_leaf': 0.0025, 'reg__n_estimators': 50, 'reg__subsample': 0.4}


## Feature importance

In [272]:
# collect the one-hot column names and the matching feature importances from
# the best gradient-boosting pipeline
best_pipe = m7.best_estimator_
onehot_step = (
    best_pipe
    .named_steps['transform_inputs']
    .named_transformers_['categorical_preprocessing']
    .named_steps['onehot']
)
onehot_columns = onehot_step.get_feature_names(input_features=cat_feature_cols)

# numeric names first, then the one-hot names - the same order in which the
# ColumnTransformer lists its transformers
feature_importance = pd.DataFrame({
    'features': np.array(num_feature_cols + list(onehot_columns)),
    'importance': best_pipe.named_steps['reg'].feature_importances_,
})

In [297]:
# bar chart of the feature importances, largest first
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
fig = px.bar(feature_importance, x="features", y="importance")

fig.show()

# YOUR QUEST: Get a better R2 and MSE than this ^^

...

# Further topics

* XGBoost
* CatBoost
* H2O

# CatBoost

# Sources:
[Bias/Variance problem](https://www.learnopencv.com/bias-variance-tradeoff-in-machine-learning/)

[Scikit-learn supervised models](https://scikit-learn.org/stable/supervised_learning.html)

[Bayesian regression post](https://stats.stackexchange.com/questions/252577/bayes-regression-how-is-it-done-in-comparison-to-standard-regression/252608)

[XGBoost feature importance](https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/)