In [7]:
import pandas as pd 
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error


In [8]:
X = np.arange(6).reshape(3, 2)

X

array([[0, 1],
       [2, 3],
       [4, 5]])

In [9]:
poly = PolynomialFeatures(1)
poly.fit_transform(X)

array([[1., 0., 1.],
       [1., 2., 3.],
       [1., 4., 5.]])

In [10]:
path = 'C:/Users/isaac/Dropbox/Apps/ShareLaTeX/information_lawyer_quality'

# Read data 
data = pd.read_csv(path + '/_aux/hd_case_value.csv') 
#data = data.sample(frac=0.1, replace=True, random_state=1)
print(data.shape)
data.columns

(5356, 28)


Index(['gen', 'horas_sem', 'sal_caidos', 'prima_antig', 'prima_vac', 'rec20',
       'prima_dom', 'desc_sem', 'desc_ob', 'sarimssinf', 'utilidades',
       'nulidad', 'salario_diario', 'reinst', 'indem', 'horas_extra',
       'antiguedad', 'min_prima_antig', 'min_ag', 'min_vac', 'min_ley',
       'c_recsueldo', 'reclutamiento', 'num_actores', 'liq_total', 'duracion',
       'liq_total_disc', 'hd'],
      dtype='object')

In [11]:
data.describe()

Unnamed: 0,gen,horas_sem,sal_caidos,prima_antig,prima_vac,rec20,prima_dom,desc_sem,desc_ob,sarimssinf,...,min_ag,min_vac,min_ley,c_recsueldo,reclutamiento,num_actores,liq_total,duracion,liq_total_disc,hd
count,5340.0,5256.0,5340.0,5340.0,5334.0,5341.0,5341.0,5341.0,5341.0,5341.0,...,5286.0,5209.0,5336.0,5340.0,5198.0,5356.0,5356.0,5356.0,5356.0,5356.0
mean,0.483895,57.286975,0.982959,0.849438,0.955381,0.254634,0.200899,0.171691,0.290957,0.552893,...,2758.465764,2068.817676,53416.99,21725.77,0.195652,1.141337,30800.8,28.801694,19760.41,0.933906
std,0.499787,15.538088,0.129437,0.357655,0.206486,0.435696,0.40071,0.377147,0.454246,0.497241,...,14974.160132,6613.74997,107801.2,242945.2,0.39674,0.464623,143720.5,153.506144,81496.73,0.24847
min,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.47,0.99,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,48.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,561.455365,361.97,20595.5,0.0,0.0,1.0,0.0,0.35,0.0,1.0
50%,0.0,60.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1282.1899,868.60999,32480.35,1077.145,0.0,1.0,9000.0,0.76,5884.49,1.0
75%,1.0,66.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,2489.5299,1929.7593,55455.49,7475.002,0.0,1.0,22162.5,1.630137,16503.61,1.0
max,1.0,258.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,725185.75,213154.98,4449881.0,15319810.0,1.0,3.0,6700000.0,1396.0,3453441.0,1.0


In [12]:
X = data[['gen',
            'reclutamiento',
            'salario_diario',
            'horas_sem',
            'reinst',
            'prima_dom',
            'desc_sem',
            'sarimssinf',
            'antiguedad',
            'c_recsueldo',
            'desc_ob',
            'min_prima_antig',
            'min_ag',
            'min_vac',
            'min_ley']]

Z = data[['num_actores',
            'gen',
            'reclutamiento',
            'salario_diario',
            'antiguedad',          
            'horas_sem',
            'reinst',
            'indem',
            'sal_caidos',
            'prima_antig',
            'prima_vac',
            'horas_extra',
            'rec20',
            'prima_dom',
            'desc_sem',
            'sarimssinf',
            'utilidades',
            'nulidad']]

y1 = data['liq_total']
y2 = data['liq_total_disc']

In [13]:
numeric_featuresX = ['salario_diario',
            'horas_sem',
            'antiguedad',
            'c_recsueldo',
            'min_prima_antig',
            'min_ag',
            'min_vac',
            'min_ley']
numeric_featuresZ = ['salario_diario',
            'horas_sem',
            'antiguedad']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())])


categorical_featuresZ = ['num_actores']
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse = False)


binary_featuresX = ['gen',
            'reclutamiento',
            'reinst',
            'prima_dom',
            'desc_sem',
            'sarimssinf',
            'desc_ob']
binary_featuresZ = ['gen',
            'reclutamiento',
            'reinst',
            'indem',
            'sal_caidos',
            'prima_antig',
            'prima_vac',
            'horas_extra',
            'rec20',
            'prima_dom',
            'desc_sem',
            'sarimssinf',
            'utilidades',
            'nulidad']
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('polynomial', PolynomialFeatures(degree=1, include_bias=False))])


preprocessorX = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_featuresX),
        ('bin', binary_transformer, binary_featuresX)])
preprocessorZ = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_featuresZ),
        ('cat', categorical_transformer, categorical_featuresZ),
        ('bin', binary_transformer, binary_featuresZ)])


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, Z_train, Z_test, y_train, y_test = train_test_split(X, Z, y2, test_size=0.2,
                                                    random_state=0)

In [15]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
from sklearn.linear_model import LinearRegression

In [16]:
# Linear Regression
LRX = Pipeline(steps=[('preprocessor', preprocessorX),
                      ('model', LinearRegression())])
LRX.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('polynomial',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['salario_diario',
                                                   'horas_sem', 'antiguedad',
                                                   'c_recsueldo',
                                                   'min_prima_antig', 'min_ag',
                                                   'min_vac', 'min_ley']),
        

In [9]:
LRZ = Pipeline(steps=[('preprocessor', preprocessorZ),
                      ('model', LinearRegression())])
LRZ.fit(Z_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('polynomial',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['salario_diario',
                                                   'horas_sem', 'antiguedad']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
  

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

In [11]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param_grid = {'model__n_estimators'  : [50, 100, 500, 1000],
              'model__max_features'  : ['auto', 'sqrt', 'log2'],
              'model__max_depth'     : [None, 1, 3, 5, 10, 20],
              'model__subsample'     : [0.5, 1],
              'model__learning_rate' : [0.001, 0.01, 0.1]
             }


GB  = GradientBoostingRegressor(
                        random_state        = 123,
                        # Activación de la parada temprana
                        validation_fraction = 0.1,
                        n_iter_no_change    = 5,
                        tol                 = 0.0001
                    )
# Gradient Boosting
GBX = Pipeline(steps=[('preprocessor', preprocessorX),
                      ('model', GB)])
GBZ = Pipeline(steps=[('preprocessor', preprocessorZ),
                      ('model', GB)])

# Búsqueda por grid search con validación cruzada
# ==============================================================================
gridX = GridSearchCV(GBX,
        param_grid = param_grid,
        scoring    = 'neg_root_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )
gridZ = GridSearchCV(GBZ,
        param_grid = param_grid,
        scoring    = 'neg_root_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

gridX.fit(X = X_train, y = y_train)
gridZ.fit(X = Z_train, y = y_train)

GridSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=5, random_state=123),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('polynomial',
                                                                                          PolynomialFeatures(include_bias=False)),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['salario_diario',
                          

In [13]:
# Resultados
# ==============================================================================
resultados = pd.DataFrame(gridX.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

NameError: name 'gridX' is not defined

In [13]:
modelo_final_GBX = gridX.best_estimator_
modelo_final_GBZ = gridZ.best_estimator_

In [14]:
importancia = permutation_importance(
                estimator    = modelo_final_GBX,
                X            = X_train,
                y            = y_train,
                n_repeats    = 5,
                scoring      = 'neg_root_mean_squared_error',
                n_jobs       = multiprocessing.cpu_count() - 1,
                random_state = 123
             )

# Se almacenan los resultados (media y desviación) en un dataframe
df_importancia = pd.DataFrame(
                    {k: importancia[k] for k in ['importances_mean', 'importances_std']}
                 )
df_importancia['feature'] = X_train.columns
df_importancia.sort_values('importances_mean', ascending=False)

Unnamed: 0,importances_mean,importances_std,feature
2,1369.923209,30.428258,salario_diario
8,1272.040208,76.346678,antiguedad
13,1063.813219,41.270977,min_vac
14,1021.884767,26.354959,min_ley
9,932.458964,37.691371,c_recsueldo
12,625.800035,46.292649,min_ag
11,395.191065,79.74321,min_prima_antig
3,349.643261,84.362845,horas_sem
0,12.981643,5.885754,gen
7,10.458658,7.64226,sarimssinf


In [15]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

In [16]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param_grid = {'model__loss'             : ['least_squares', 'least_absolute_deviation'],
              'model__learning_rate'    : [0.001, 0.01, 0.1],
              'model__max_depth'        : [None, 1, 3, 5, 10, 20],
              'model__l2_regularization': [0, 1]
             }


# Gradient Boosting
HGBX = Pipeline(steps=[('preprocessor', preprocessorX),
                      ('model', HistGradientBoostingRegressor(
                        max_iter            = 10000,
                        # Activación de la parada temprana
                        early_stopping      = True,
                        scoring             = 'loss',
                        validation_fraction = 0.1,
                        n_iter_no_change    = 10,
                        tol                 = 1e-7,
                        random_state        = 123
                    ))])
HGBZ = Pipeline(steps=[('preprocessor', preprocessorZ),
                      ('model', HistGradientBoostingRegressor(
                        max_iter            = 10000,
                        # Activación de la parada temprana
                        early_stopping      = True,
                        scoring             = 'loss',
                        validation_fraction = 0.1,
                        n_iter_no_change    = 10,
                        tol                 = 1e-7,
                        random_state        = 123
                    ))])

# Búsqueda por grid search con validación cruzada
# ==============================================================================
gridX = GridSearchCV(HGBX,
        param_grid = param_grid,
        scoring    = 'neg_root_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )
gridZ = GridSearchCV(HGBZ,
        param_grid = param_grid,
        scoring    = 'neg_root_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

gridX.fit(X = X_train, y = y_train)
gridZ.fit(X = Z_train, y = y_train)

# Resultados
# ==============================================================================
resultados = pd.DataFrame(gridX.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

Unnamed: 0,param_model__l2_regularization,param_model__learning_rate,param_model__loss,param_model__max_depth,mean_test_score,std_test_score,mean_train_score,std_train_score
17,0,0.01,least_squares,20.0,-62004.285236,20479.769375,-59164.501226,5987.460466
12,0,0.01,least_squares,,-62007.484829,20487.917194,-59153.487344,5995.054509
53,1,0.01,least_squares,20.0,-62069.271694,20485.313862,-58762.459814,5997.236032
48,1,0.01,least_squares,,-62080.375746,20477.548613,-58796.357092,5921.68308


In [17]:
modelo_final_HGBX = gridX.best_estimator_
modelo_final_HGBZ = gridZ.best_estimator_

In [18]:
from xgboost import XGBRegressor


In [19]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param_grid = {'model__max_depth'        : [None, 1, 3, 5, 10, 20],
              'model__subsample'        : [0.5, 1],
              'model__learning_rate'    : [0.001, 0.01, 0.1],
              'model__booster'          : ['gbtree'],
              'model__n_estimators'     : [50, 100, 500, 1000],
             }


# Crear conjunto de validación
# ==============================================================================
np.random.seed(123)
idx_validacion = np.random.choice(
                    X_train.shape[0],
                    size= int(X_train.shape[0]*0.1),
                    replace=False
                 )

In [20]:
X_val = X_train.iloc[idx_validacion, :].copy()
Z_val = Z_train.iloc[idx_validacion, :].copy()
y_val = y_train.iloc[idx_validacion].copy()

In [21]:
X_train_grid = X_train.reset_index(drop = True).drop(idx_validacion, axis = 0).copy()
Z_train_grid = Z_train.reset_index(drop = True).drop(idx_validacion, axis = 0).copy()
y_train_grid = y_train.reset_index(drop = True).drop(idx_validacion, axis = 0).copy()

In [22]:
preprocessorX_ = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_featuresX),
        ('bin', binary_transformer, binary_featuresX)],
        remainder='passthrough')
preprocessorZ_ = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_featuresZ),
        ('cat', categorical_transformer, categorical_featuresZ),
        ('bin', binary_transformer, binary_featuresZ)],
        remainder='passthrough')

In [23]:
encoderX = preprocessorX_.fit(X_train)
encoderZ = preprocessorZ_.fit(Z_train)

In [24]:
X_train_grid = encoderX.transform(X_train_grid)
Z_train_grid = encoderZ.transform(Z_train_grid)

In [25]:
X_val = encoderX.transform(X_val)
Z_val = encoderZ.transform(Z_val)

In [26]:
# XGBoost necesita pasar los paramétros específicos del entrenamiento al llamar
# al método .fit()
fit_paramsX = {"model__early_stopping_rounds" : 5, 
              "model__eval_metric"           : "rmse", 
              "model__eval_set"              : [(X_val, y_val)],
              "model__verbose"               : 0
             }
fit_paramsZ = {"model__early_stopping_rounds" : 5, 
              "model__eval_metric"           : "rmse", 
              "model__eval_set"              : [(Z_val, y_val)],
              "model__verbose"               : 0
             }

In [27]:
XGBX = Pipeline(steps=[('model', XGBRegressor(
                        random_state = 123
                    ))])
XGBZ = Pipeline(steps=[('model', XGBRegressor(
                        random_state = 123
                    ))])

In [28]:
# Búsqueda por grid search con validación cruzada
# ==============================================================================
gridX = GridSearchCV(
        estimator  = XGBX,
        param_grid = param_grid,
        scoring    = 'neg_root_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )
gridZ = GridSearchCV(
        estimator  = XGBZ,
        param_grid = param_grid,
        scoring    = 'neg_root_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

In [29]:
gridX.fit(X = X_train_grid, y = y_train_grid, **fit_paramsX)
gridZ.fit(X = Z_train_grid, y = y_train_grid, **fit_paramsZ)

GridSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=5, random_state=123),
             estimator=Pipeline(steps=[('model',
                                        XGBRegressor(base_score=None,
                                                     booster=None,
                                                     colsample_bylevel=None,
                                                     colsample_bynode=None,
                                                     colsample_bytree=None,
                                                     gamma=None, gpu_id=None,
                                                     importance_type='gain',
                                                     interaction_constraints=None,
                                                     learning_rate=None,
                                                     max_delta_step=None,
                                                     max_depth=None,
                                                     min_

In [6]:
# Resultados
# ==============================================================================
resultados = pd.DataFrame(gridX.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

NameError: name 'gridX' is not defined

In [31]:
modelo_final_XGBX = gridX.best_estimator_
modelo_final_XGBZ = gridZ.best_estimator_

In [32]:
from sklearn.ensemble import RandomForestRegressor


In [None]:
param_grid = {'model__n_estimators': [100, 200, 500],
              'model__max_features': ["auto", "sqrt", "log2"],
              'model__max_depth'   : [None, 3, 10, 20],
              'model__criterion'   : ["mse", "mae"]
             }

# Random Forests
RFX = Pipeline(steps=[('preprocessor', preprocessorX),
                      ('model', RandomForestRegressor(random_state=123))])
RFZ = Pipeline(steps=[('preprocessor', preprocessorZ),
                      ('model', RandomForestRegressor(random_state=123))])

# Búsqueda por grid search con validación cruzada
# ==============================================================================
gridX = GridSearchCV(
        estimator  = RFX,
        param_grid = param_grid,
        scoring    = 'neg_root_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )
gridZ = GridSearchCV(
        estimator  = RFZ,
        param_grid = param_grid,
        scoring    = 'neg_root_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

gridX.fit(X = X_train, y = y_train)
gridZ.fit(X = Z_train, y = y_train)

# Resultados
# ==============================================================================
resultados = pd.DataFrame(gridX.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

In [None]:
modelo_final_RFX = gridX.best_estimator_
modelo_final_RFZ = gridZ.best_estimator_

In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.cross_decomposition import PLSRegression
#from sklearn.pipeline import make_pipeline

In [None]:
# PCR
pcrX = Pipeline(steps=[('preprocessor', preprocessorX),
                     ('components', TruncatedSVD(n_components=10)),
                      ('model', LinearRegression())])
pcrZ = Pipeline(steps=[('preprocessor', preprocessorZ),
                     ('components', TruncatedSVD(n_components=10)),
                      ('model', LinearRegression())])

pcrX.fit(X_train, y_train)
pcrZ.fit(Z_train, y_train)


In [None]:
from sklearn.preprocessing import FunctionTransformer

plsX = Pipeline(steps=[('preprocessor', preprocessorX),
                    ('model', PLSRegression(n_components=10))])
plsZ = Pipeline(steps=[('preprocessor', preprocessorZ),
                    ('model', PLSRegression(n_components=10))])



In [None]:
plsX.fit(X_train, y_train)
plsZ.fit(Z_train, y_train)

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Espacio de búsqueda de cada hiperparámetro
# ==============================================================================
param_distributions = {
    'modelo__hidden_layer_sizes': [(10), (20), (10, 10), (100,)],
    'modelo__alpha': np.logspace(-5, 0, 10),
    'modelo__learning_rate_init': [0.001, 0.01],
    'modelo__solver': ['lbfgs', 'adam'],
}

# Se combinan los pasos de preprocesado y el modelo en un mismo pipeline
MLPX = Pipeline([('preprocessing', preprocessorX),
                 ('modelo', MLPRegressor(max_iter=10000, random_state=123))])
MLPZ = Pipeline([('preprocessing', preprocessorZ),
                 ('modelo', MLPRegressor(max_iter=10000, random_state=123))])

# Búsqueda por validación cruzada
# ==============================================================================
gridX = RandomizedSearchCV(
        estimator  = MLPX,
        param_distributions = param_distributions,
        n_iter     = 100,
        scoring    = 'neg_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123),  
        verbose    = 0,
        random_state = 123,
        return_train_score = True
       )
gridZ = RandomizedSearchCV(
        estimator  = MLPZ,
        param_distributions = param_distributions,
        n_iter     = 100,
        scoring    = 'neg_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123),  
        verbose    = 0,
        random_state = 123,
        return_train_score = True
       )

gridX.fit(X = X_train, y = y_train)
gridZ.fit(X = Z_train, y = y_train)

# Resultados
# ==============================================================================
resultados = pd.DataFrame(gridX.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

In [None]:
modelo_final_MLPX = gridX.best_estimator_
modelo_final_MLPZ = gridZ.best_estimator_

In [None]:
#Metric Results

metric_results_oosX = pd.DataFrame({'Linear Regression' :  [explained_variance_score(y_test, LRX.predict(X_test), multioutput='uniform_average'),
    r2_score(y_test, LRX.predict(X_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, LRX.predict(X_test)),
    mean_squared_error(y_test, LRX.predict(X_test)),
    mean_absolute_percentage_error(y_test, LRX.predict(X_test))], 
                                     'GB' :  [explained_variance_score(y_test, modelo_final_GBX.predict(X_test), multioutput='uniform_average'),
    r2_score(y_test, modelo_final_GBX.predict(X_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, modelo_final_GBX.predict(X_test)),
    mean_squared_error(y_test, modelo_final_GBX.predict(X_test)),
    mean_absolute_percentage_error(y_test, modelo_final_GBX.predict(X_test))],
                                     'HGB' :  [explained_variance_score(y_test, modelo_final_HBX.predict(X_test), multioutput='uniform_average'),
    r2_score(y_test, modelo_final_HBX.predict(X_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, modelo_final_HBX.predict(X_test)),
    mean_squared_error(y_test, modelo_final_HBX.predict(X_test)),
    mean_absolute_percentage_error(y_test, modelo_final_HBX.predict(X_test))],
                                    'XGB' :  [explained_variance_score(y_test, modelo_final_XGBX.predict(X_test), multioutput='uniform_average'),
    r2_score(y_test, modelo_final_XGBX.predict(X_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, modelo_final_XGBX.predict(X_test)),
    mean_squared_error(y_test, modelo_final_XGBX.predict(X_test)),
    mean_absolute_percentage_error(y_test, modelo_final_XGBX.predict(X_test))],
                                    'RF' :  [explained_variance_score(y_test, modelo_final_RFX.predict(X_test), multioutput='uniform_average'),
    r2_score(y_test, modelo_final_RFX.predict(X_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, modelo_final_RFX.predict(X_test)),
    mean_squared_error(y_test, modelo_final_RFX.predict(X_test)),
    mean_absolute_percentage_error(y_test, modelo_final_RFX.predict(X_test))],
                                    'PC Regression' :  [explained_variance_score(y_test, pcrX.predict(X_test), multioutput='uniform_average'),
    r2_score(y_test, pcrX.predict(X_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, pcrX.predict(X_test)),
    mean_squared_error(y_test, pcrX.predict(X_test)),
    mean_absolute_percentage_error(y_test, pcrX.predict(X_test))], 
                                    'PLS' :  [explained_variance_score(y_test, plsX.predict(X_test), multioutput='uniform_average'),
    r2_score(y_test, plsX.predict(X_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, plsX.predict(X_test)),
    mean_squared_error(y_test, plsX.predict(X_test)),
    mean_absolute_percentage_error(y_test, plsX.predict(X_test))], 
                                    'MLP' :  [explained_variance_score(y_test, modelo_final_MLPX.predict(X_test), multioutput='uniform_average'),
    r2_score(y_test, modelo_final_MLPX.predict(X_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, modelo_final_MLPX.predict(X_test)),
    mean_squared_error(y_test, modelo_final_MLPX.predict(X_test)),
    mean_absolute_percentage_error(y_test, modelo_final_MLPX.predict(X_test))]
                        })    


metric_results_insX = pd.DataFrame({'Linear Regression' :  [explained_variance_score(y_train, LRX.predict(X_train), multioutput='uniform_average'),
    r2_score(y_train, LRX.predict(X_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, LRX.predict(X_train)),
    mean_squared_error(y_train, LRX.predict(X_train)),
    mean_absolute_percentage_error(y_train, LRX.predict(X_train))], 
                                     'GB' :  [explained_variance_score(y_train, modelo_final_GBX.predict(X_train), multioutput='uniform_average'),
    r2_score(y_train, modelo_final_GBX.predict(X_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, modelo_final_GBX.predict(X_train)),
    mean_squared_error(y_train, modelo_final_GBX.predict(X_train)),
    mean_absolute_percentage_error(y_train, modelo_final_GBX.predict(X_train))],
                                     'HGB' :  [explained_variance_score(y_train, modelo_final_HBX.predict(X_train), multioutput='uniform_average'),
    r2_score(y_train, modelo_final_HBX.predict(X_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, modelo_final_HBX.predict(X_train)),
    mean_squared_error(y_train, modelo_final_HBX.predict(X_train)),
    mean_absolute_percentage_error(y_train, modelo_final_HBX.predict(X_train))],
                                    'XGB' :  [explained_variance_score(y_train, modelo_final_XGBX.predict(X_train), multioutput='uniform_average'),
    r2_score(y_train, modelo_final_XGBX.predict(X_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, modelo_final_XGBX.predict(X_train)),
    mean_squared_error(y_train, modelo_final_XGBX.predict(X_train)),
    mean_absolute_percentage_error(y_train, modelo_final_XGBX.predict(X_train))],
                                    'RF' :  [explained_variance_score(y_train, modelo_final_RFX.predict(X_train), multioutput='uniform_average'),
    r2_score(y_train, modelo_final_RFX.predict(X_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, modelo_final_RFX.predict(X_train)),
    mean_squared_error(y_train, modelo_final_RFX.predict(X_train)),
    mean_absolute_percentage_error(y_train, modelo_final_RFX.predict(X_train))],
                                    'PC Regression' :  [explained_variance_score(y_train, pcrX.predict(X_train), multioutput='uniform_average'),
    r2_score(y_train, pcrX.predict(X_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, pcrX.predict(X_train)),
    mean_squared_error(y_train, pcrX.predict(X_train)),
    mean_absolute_percentage_error(y_train, pcrX.predict(X_train))], 
                                    'PLS' :  [explained_variance_score(y_train, plsX.predict(X_train), multioutput='uniform_average'),
    r2_score(y_train, plsX.predict(X_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, plsX.predict(X_train)),
    mean_squared_error(y_train, plsX.predict(X_train)),
    mean_absolute_percentage_error(y_train, plsX.predict(X_train))], 
                                    'MLP' :  [explained_variance_score(y_train, modelo_final_MLPX.predict(X_train), multioutput='uniform_average'),
    r2_score(y_train, modelo_final_MLPX.predict(X_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, modelo_final_MLPX.predict(X_train)),
    mean_squared_error(y_train, modelo_final_MLPX.predict(X_train)),
    mean_absolute_percentage_error(y_train, modelo_final_MLPX.predict(X_train))]
                        })     
     
metric_results_oosZ = pd.DataFrame({'Linear Regression' :  [explained_variance_score(y_test, LRZ.predict(Z_test), multioutput='uniform_average'),
    r2_score(y_test, LRZ.predict(Z_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, LRZ.predict(Z_test)),
    mean_squared_error(y_test, LRZ.predict(Z_test)),
    mean_absolute_percentage_error(y_test, LRZ.predict(Z_test))], 
                                     'GB' :  [explained_variance_score(y_test, modelo_final_GBZ.predict(Z_test), multioutput='uniform_average'),
    r2_score(y_test, modelo_final_GBZ.predict(Z_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, modelo_final_GBZ.predict(Z_test)),
    mean_squared_error(y_test, modelo_final_GBZ.predict(Z_test)),
    mean_absolute_percentage_error(y_test, modelo_final_GBZ.predict(Z_test))],
                                     'HGB' :  [explained_variance_score(y_test, modelo_final_HBZ.predict(Z_test), multioutput='uniform_average'),
    r2_score(y_test, modelo_final_HBZ.predict(Z_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, modelo_final_HBZ.predict(Z_test)),
    mean_squared_error(y_test, modelo_final_HBZ.predict(Z_test)),
    mean_absolute_percentage_error(y_test, modelo_final_HBZ.predict(Z_test))],
                                    'XGB' :  [explained_variance_score(y_test, modelo_final_XGBZ.predict(Z_test), multioutput='uniform_average'),
    r2_score(y_test, modelo_final_XGBZ.predict(Z_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, modelo_final_XGBZ.predict(Z_test)),
    mean_squared_error(y_test, modelo_final_XGBZ.predict(Z_test)),
    mean_absolute_percentage_error(y_test, modelo_final_XGBZ.predict(Z_test))],
                                    'RF' :  [explained_variance_score(y_test, modelo_final_RFZ.predict(Z_test), multioutput='uniform_average'),
    r2_score(y_test, modelo_final_RFZ.predict(Z_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, modelo_final_RFZ.predict(Z_test)),
    mean_squared_error(y_test, modelo_final_RFZ.predict(Z_test)),
    mean_absolute_percentage_error(y_test, modelo_final_RFZ.predict(Z_test))],
                                    'PC Regression' :  [explained_variance_score(y_test, pcrZ.predict(Z_test), multioutput='uniform_average'),
    r2_score(y_test, pcrZ.predict(Z_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, pcrZ.predict(Z_test)),
    mean_squared_error(y_test, pcrZ.predict(Z_test)),
    mean_absolute_percentage_error(y_test, pcrZ.predict(Z_test))], 
                                    'PLS' :  [explained_variance_score(y_test, plsZ.predict(Z_test), multioutput='uniform_average'),
    r2_score(y_test, plsZ.predict(Z_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, plsZ.predict(Z_test)),
    mean_squared_error(y_test, plsZ.predict(Z_test)),
    mean_absolute_percentage_error(y_test, plsZ.predict(Z_test))], 
                                    'MLP' :  [explained_variance_score(y_test, modelo_final_MLPZ.predict(Z_test), multioutput='uniform_average'),
    r2_score(y_test, modelo_final_MLPZ.predict(Z_test), multioutput='variance_weighted'),
    mean_absolute_error(y_test, modelo_final_MLPZ.predict(Z_test)),
    mean_squared_error(y_test, modelo_final_MLPZ.predict(Z_test)),
    mean_absolute_percentage_error(y_test, modelo_final_MLPZ.predict(Z_test))]
                        })    


metric_results_insZ = pd.DataFrame({'Linear Regression' :  [explained_variance_score(y_train, LRZ.predict(Z_train), multioutput='uniform_average'),
    r2_score(y_train, LRZ.predict(Z_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, LRZ.predict(Z_train)),
    mean_squared_error(y_train, LRZ.predict(Z_train)),
    mean_absolute_percentage_error(y_train, LRZ.predict(Z_train))], 
                                     'GB' :  [explained_variance_score(y_train, modelo_final_GBZ.predict(Z_train), multioutput='uniform_average'),
    r2_score(y_train, modelo_final_GBZ.predict(Z_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, modelo_final_GBZ.predict(Z_train)),
    mean_squared_error(y_train, modelo_final_GBZ.predict(Z_train)),
    mean_absolute_percentage_error(y_train, modelo_final_GBZ.predict(Z_train))],
                                     'HGB' :  [explained_variance_score(y_train, modelo_final_HBZ.predict(Z_train), multioutput='uniform_average'),
    r2_score(y_train, modelo_final_HBZ.predict(Z_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, modelo_final_HBZ.predict(Z_train)),
    mean_squared_error(y_train, modelo_final_HBZ.predict(Z_train)),
    mean_absolute_percentage_error(y_train, modelo_final_HBZ.predict(Z_train))],
                                    'XGB' :  [explained_variance_score(y_train, modelo_final_XGBZ.predict(Z_train), multioutput='uniform_average'),
    r2_score(y_train, modelo_final_XGBZ.predict(Z_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, modelo_final_XGBZ.predict(Z_train)),
    mean_squared_error(y_train, modelo_final_XGBZ.predict(Z_train)),
    mean_absolute_percentage_error(y_train, modelo_final_XGBZ.predict(Z_train))],
                                    'RF' :  [explained_variance_score(y_train, modelo_final_RFZ.predict(Z_train), multioutput='uniform_average'),
    r2_score(y_train, modelo_final_RFZ.predict(Z_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, modelo_final_RFZ.predict(Z_train)),
    mean_squared_error(y_train, modelo_final_RFZ.predict(Z_train)),
    mean_absolute_percentage_error(y_train, modelo_final_RFZ.predict(Z_train))],
                                    'PC Regression' :  [explained_variance_score(y_train, pcrZ.predict(Z_train), multioutput='uniform_average'),
    r2_score(y_train, pcrZ.predict(Z_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, pcrZ.predict(Z_train)),
    mean_squared_error(y_train, pcrZ.predict(Z_train)),
    mean_absolute_percentage_error(y_train, pcrZ.predict(Z_train))], 
                                    'PLS' :  [explained_variance_score(y_train, plsZ.predict(Z_train), multioutput='uniform_average'),
    r2_score(y_train, plsZ.predict(Z_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, plsZ.predict(Z_train)),
    mean_squared_error(y_train, plsZ.predict(Z_train)),
    mean_absolute_percentage_error(y_train, plsZ.predict(Z_train))], 
                                    'MLP' :  [explained_variance_score(y_train, modelo_final_MLPZ.predict(Z_train), multioutput='uniform_average'),
    r2_score(y_train, modelo_final_MLPZ.predict(Z_train), multioutput='variance_weighted'),
    mean_absolute_error(y_train, modelo_final_MLPZ.predict(Z_train)),
    mean_squared_error(y_train, modelo_final_MLPZ.predict(Z_train)),
    mean_absolute_percentage_error(y_train, modelo_final_MLPZ.predict(Z_train))]
                        })   


# Change the row indexes
metric_results_oosX.index = ['Explained Variance', 'R2', 'MAE', 'MSE', 'MAPE']
metric_results_oosZ.index = ['Explained Variance', 'R2', 'MAE', 'MSE', 'MAPE']

metric_results_insX.index = ['Explained Variance', 'R2', 'MAE', 'MSE', 'MAPE']
metric_results_insZ.index = ['Explained Variance', 'R2', 'MAE', 'MSE', 'MAPE']


In [None]:

metric_results_oosX

In [None]:

metric_results_insX

In [None]:

metric_results_oosZ

In [None]:

metric_results_insZ

In [None]:
print('Finish')

In [2]:
# Resultados
# ==============================================================================
resultados = pd.DataFrame(gridX.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

NameError: name 'pd' is not defined

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from scipy.stats import uniform

# Espacio de búsqueda de cada hiperparámetro
# ==============================================================================
param_distributions = {
    'modelo__alpha': uniform(0.001, 10) ,
    'modelo__l1_ratio' : uniform(0.019, 0.99)
}

# Se combinan los pasos de preprocesado y el modelo en un mismo pipeline
ENX = Pipeline([('preprocessing', preprocessorX),
                 ('modelo', ElasticNet(random_state=123))])

ENZ = Pipeline([('preprocessing', preprocessorZ),
                 ('modelo', ElasticNet(random_state=123))])


# Búsqueda por validación cruzada
# ==============================================================================
gridX = RandomizedSearchCV(
        estimator  = ENX,
        param_distributions = param_distributions,
        n_iter     = 1000,
        scoring    = 'neg_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123),  
        verbose    = 4,
        random_state = 123,
        return_train_score = True
       )
gridZ = RandomizedSearchCV(
        estimator  = ENZ,
        param_distributions = param_distributions,
        n_iter     = 1000,
        scoring    = 'neg_mean_squared_error',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123),  
        verbose    = 4,
        random_state = 123,
        return_train_score = True
       )

gridX.fit(X = X_train, y = y_train)
gridZ.fit(X = Z_train, y = y_train)

# Resultados
# ==============================================================================
resultados = pd.DataFrame(gridX.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

Fitting 15 folds for each of 1000 candidates, totalling 15000 fits


 -7.70202692e+09 -1.17463523e+10 -6.98970499e+09 -1.11366760e+10
 -9.50293709e+09 -8.08372245e+09 -1.17490989e+10 -7.80302753e+09
 -6.68453993e+09 -7.79747546e+09 -1.08967195e+10 -1.61527848e+10
 -8.41040312e+09 -7.69431936e+09 -1.86277038e+10 -6.76304245e+09
 -6.47100533e+09 -8.84946607e+09 -6.20937654e+09 -1.28160807e+11
 -8.64457482e+09 -3.28601204e+10 -7.85609490e+09 -8.17144462e+09
 -1.00264660e+10 -6.93413332e+09 -7.83757084e+09 -8.62689257e+09
 -5.94505776e+09 -6.39776410e+09 -1.23526871e+10 -5.55785800e+10
 -9.10756273e+09 -7.65505930e+10 -6.75753758e+09 -1.75835789e+10
 -1.14229094e+10 -7.38787838e+09 -9.76805843e+09 -7.35839050e+09
 -8.88409101e+09             nan -1.23504955e+10 -8.93579095e+09
 -1.17725427e+10 -9.34441445e+09 -9.13394195e+09 -1.16705292e+10
 -1.08817951e+10 -7.17732818e+09 -6.56133710e+09 -8.57173891e+09
 -2.52594011e+10 -9.50978938e+09 -7.83343139e+09 -2.10004148e+11
 -6.05564657e+09 -9.78019329e+09 -7.48721293e+10 -1.07161186e+10
 -6.68879151e+09 -6.29801

Fitting 15 folds for each of 1000 candidates, totalling 15000 fits


In [None]:
multiprocessing.cpu_count()