In [80]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, HuberRegressor, RidgeCV, BayesianRidge, Ridge 
from sklearn.svm import SVR, LinearSVR
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, FunctionTransformer, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
# from dython.nominal import associations

%matplotlib inline

In [81]:
def export_to_kaggle_submission(test_df: pd.DataFrame, y_pred_scaled: pd.Series, filename: str):
    y_pred = np.expm1(y_pred_scaled)
    y_pred[y_pred < 0] = 0

    test_df['predicted'] = y_pred
    test_df.index.names = ['id']
    test_df['predicted'].to_csv(f"../../delivery/predictions/{filename}.csv")

In [82]:
def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error 
    
    Args:
        y_true (np.array): n-dimensional vector of ground-truth values 
        y_pred (np.array): n-dimensional vecotr of predicted values 
    
    Returns:
        A scalar float with the rmsle value 
    
    Note: You can alternatively use sklearn and just do: 
        `sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5`
    """
    y_pred[y_pred < 0] = 0
    assert (y_true >= 0).all(), 'Received negative y_true values'
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'
    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))

In [83]:
all_data = pd.read_csv("../../own_data/all_with_stores_pop.csv")
all_data.set_index(["dataset", "range_index"], inplace=True)
all_data['in_mall'] = all_data['mall_name'].notna()
all_data['in_chain'] = all_data['chain_name'].notna()
all_data['mall_name'] = all_data['mall_name'].fillna("None")
all_data['as'] = all_data['store_name'].str.contains(r"\b(AS)\b", case=False, regex=True)
all_data['chain_name'] = all_data['chain_name'].fillna("None")
all_data['busstop_id'] = all_data['busstop_id'].map(str)
all_data['lv1'] = all_data['lv1'].map(str)
all_data['lv2'] = all_data['lv2'].map(str)
all_data['lv3'] = all_data['lv3'].map(str)
all_data['lv4'] = all_data['lv4'].map(str)
all_data.drop(columns=[
  
    'store_name',
    'address',
    'importance_level',
    'busstop_id', 
    'other_stores_50', 
    'buss_stops_300', 
    'municipality_name', 
    'lv1', 
    'lat', 
    'couple_children_6_to_17_years', 
    'couple_without_children_x', 
    'single_parent_children_0_to_5_years', 
    'singles_x', 
    'singles_y', 
    'couple_without_children_y', 
    'couple_with_children', 
    'district_age_0-14_distribution', 
    'district_age_65-90_distribution', 
    'grunnkrets_population', 
    'municipality_density', 
    'all_households', 
    'lv2_population_district_div_count_stores', 
    'lv1_population_municipality_div_count_stores', 
    'lv2_population_municipality_div_count_stores', 
    'in_mall', 
    'lv3_population_district_div_count_stores', 
    'district_name', 
    'num_of_buss_stops_closer_that_1000_to_busstop', 
    'municipality_age_0-14_distribution', 
    'municipality_age_35-64_distribution', 
    'municipality_age_65-90_distribution', 
    ], inplace=True)

data_with_label = all_data.loc[["train"]]
data_with_label["first_index"] = data_with_label["store_id"].str.split("-").str[0]
data_with_label["second_index"] = data_with_label["store_id"].str.split("-").str[1]
data_with_label["third_index"] = data_with_label["store_id"].str.split("-").str[2]

data_with_label.set_index('store_id', inplace=True)
data_without_label = all_data.loc[['test']]
data_without_label["first_index"] = data_without_label["store_id"].str.split("-").str[0]
data_without_label["second_index"] = data_without_label["store_id"].str.split("-").str[1]
data_without_label["third_index"] = data_without_label["store_id"].str.split("-").str[2]


data_without_label.set_index('store_id', inplace=True)
data_without_label.drop(columns=["revenue"], inplace=True)


train, test = train_test_split(data_with_label, test_size=0.2, random_state=3)
#train = train[train.revenue > 0]
#train = train[train.revenue != 1]
#train = train[train.revenue != 0.5]

train = data_with_label
train = train[train.revenue > 0]
#train = train[train.revenue != 1]
#train = train[train.revenue != 0.5]

X_train, y_train = train.loc[:, train.columns != 'revenue'], train['revenue']
X_test, y_test = test.loc[:, test.columns != 'revenue'], test['revenue']
y_train_scaled = np.log1p(y_train)
y_test_scaled = np.log1p(y_test)

X_test = data_without_label

X = data_with_label.drop(columns="revenue")
y_scaled = np.log1p(data_with_label.revenue)

  all_data['as'] = all_data['store_name'].str.contains(r"\b(AS)\b", case=False, regex=True)


In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.2, random_state=1)

In [85]:
numeric_features = X_train.select_dtypes(include=[np.number]).columns
numeric_features = list(numeric_features.to_numpy())

print(numeric_features)

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
       # ("feature_creation", PolynomialFeatures()),
       # ('feature_selection', SelectFromModel(LinearSVR())),
        ]
)

categorical_features = X_train.select_dtypes(include=[np.object0]).columns
categorical_features_include_bool = list(categorical_features.to_numpy())
categorical_features_include_bool.extend(list(X_train.select_dtypes(include=[np.bool8]).columns.to_numpy()))
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehotencoding", OneHotEncoder(handle_unknown="ignore"))
    ]
)


preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features_include_bool),
    ],
    remainder="passthrough",
)

categorical_features_include_bool = list(categorical_features.to_numpy())
categorical_features_include_bool.extend(list(X_train.select_dtypes(include=[np.bool8]).columns.to_numpy()))
# categorical_transformer_ordinal = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
# preprocessor_ordinal = ColumnTransformer(
#     transformers=[
#         ("num", numeric_transformer, numeric_features),
#         ("cat", categorical_transformer_ordinal, categorical_features_include_bool)
#     ],
#     remainder="passthrough",
# )

['grunnkrets_id', 'lon', 'other_stores_1000', 'other_stores_100', 'other_stores_250', 'buss_stops_1000', 'grunnkrets_1', 'distance_closest_busstop', 'area_km2', 'couple_children_0_to_5_years', 'couple_children_18_or_above', 'single_parent_children_18_or_above', 'single_parent_children_6_to_17_years', 'other_households', 'single_parent_with_children', 'district_age_15-34_distribution', 'district_age_35-64_distribution', 'municipality_age_15-34_distribution', 'district_population', 'municipality_population', 'district_area', 'municipality_area', 'district_density', 'lv1_population_district_div_count_stores', 'lv4_population_district_div_count_stores', 'lv3_population_municipality_div_count_stores', 'lv4_population_municipality_div_count_stores']


In [63]:
svr = LinearSVR(random_state=0, loss="squared_epsilon_insensitive", C=0.2268673277820314, tol=0.0010488018393850595, epsilon=1.31872457533877e-05)

pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), 
    ("model", 
    svr)])
pipeline.fit(
  X_train,
  y_train
)

In [64]:
preds = pipeline.predict(X_test)
print(rmsle(np.expm1(y_test), np.expm1(preds)))

0.7088277589549292


In [36]:
import optuna
from sklearn.model_selection import cross_val_score
def objective_svr(trial):
    params = {
        'loss': trial.suggest_categorical('loss',['epsilon_insensitive', 'squared_epsilon_insensitive']),
        'C': trial.suggest_float('C', 0, .6),
        'tol': trial.suggest_float('tol', 1e-5, 1e-2, log=True),
        'epsilon': trial.suggest_float('epsilon', 0, 1)
    }
    svr = Pipeline(
        steps=[("preprocessor", preprocessor), 
        ("SVR", 
        LinearSVR(**params, random_state=0))]
    )
    return cross_val_score(svr, X_train, y_train_scaled, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1).mean()

In [37]:
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(), direction="maximize"
)
study.optimize(objective_svr, n_trials=100, timeout=6000)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-11-04 14:37:01,042][0m A new study created in memory with name: no-name-9a0d966a-9205-4b32-9701-52c2b56da435[0m
[32m[I 2022-11-04 14:37:06,893][0m Trial 0 finished with value: -0.7571469324830007 and parameters: {'loss': 'squared_epsilon_insensitive', 'C': 0.01382961699810774, 'tol': 0.0014084886866842915, 'epsilon': 0.7591779596314121}. Best is trial 0 with value: -0.7571469324830007.[0m
[32m[I 2022-11-04 14:37:11,946][0m Trial 1 finished with value: -0.7124828530292056 and parameters: {'loss': 'squared_epsilon_insensitive', 'C': 0.2577806586817436, 'tol': 7.515989469417148e-05, 'epsilon': 0.5743608072640257}. Best is trial 1 with value: -0.7124828530292056.[0m
[32m[I 2022-11-04 14:37:13,943][0m Trial 2 finished with value: -0.7359693190263249 and parameters: {'loss': 'squared_epsilon_insensitive', 'C': 0.3445971722249992, 'tol': 0.00021618373895402446, 'epsilon': 0.8595553301857585}. Best is trial 1 with value: -0.7124828530292056.[0m
[32m[I 2022-11-04 14:37:

Number of finished trials: 100
Best trial:
  Value: -0.6961113480493227
  Params: 
    loss: squared_epsilon_insensitive
    C: 0.2268673277820314
    tol: 0.0010488018393850595
    epsilon: 1.31872457533877e-05


In [9]:
import optuna
from sklearn.model_selection import cross_val_score
def objective_huber(trial):
    params = {
        'fit_intercept': trial.suggest_categorical('fit_intercept',[True, False]),
        'alpha': trial.suggest_float('alpha', 0, 1e-2),
        'tol': trial.suggest_float('tol', 1e-5, 1e-2, log=True),
        'epsilon': trial.suggest_float('epsilon', 1, 5),
        #'max_iter': trial.suggest_int('max_iter', 50, 1000)
    }
    huber = Pipeline(
        steps=[("preprocessor", preprocessor), 
        ("huber", 
        HuberRegressor(**params))]
    )
    return cross_val_score(huber, X_train, y_train_scaled, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1).mean()

In [10]:
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(), direction="maximize"
)
study.optimize(objective_huber, n_trials=100, timeout=6000)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-11-08 08:39:24,567][0m A new study created in memory with name: no-name-a007d822-9394-470e-b73d-4b4f9ca5fe57[0m
[32m[I 2022-11-08 08:39:33,210][0m Trial 0 finished with value: -0.7030465230067734 and parameters: {'fit_intercept': False, 'alpha': 0.004901005101769907, 'tol': 0.009485527162155017, 'epsilon': 4.2908726184230535}. Best is trial 0 with value: -0.7030465230067734.[0m
[32m[I 2022-11-08 08:39:37,945][0m Trial 1 finished with value: -0.7102305329054347 and parameters: {'fit_intercept': False, 'alpha': 6.431403812635961e-05, 'tol': 0.0006922722136190553, 'epsilon': 1.139586886786256}. Best is trial 0 with value: -0.7030465230067734.[0m
[32m[I 2022-11-08 08:39:41,389][0m Trial 2 finished with value: -0.7040186482832228 and parameters: {'fit_intercept': True, 'alpha': 0.0014940168462436187, 'tol': 0.00016302945816946472, 'epsilon': 3.1864768273231223}. Best is trial 0 with value: -0.7030465230067734.[0m
[32m[I 2022-11-08 08:39:44,830][0m Trial 3 finished 

Number of finished trials: 100
Best trial:
  Value: -0.7021436534825313
  Params: 
    fit_intercept: False
    alpha: 0.005177070095980384
    tol: 3.8623461405655734e-05
    epsilon: 4.907148938631252


In [12]:
huber = HuberRegressor(fit_intercept= False,
                    alpha= 0.005177070095980384,
                    tol= 3.8623461405655734e-05,
                    epsilon= 4.907148938631252,
                    max_iter=1000)

pipeline2 = Pipeline(
    steps=[("preprocessor", preprocessor), 
    ("model", 
    huber)])
pipeline2.fit(
  X_train,
  y_train_scaled
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [22]:
preds_2 = pipeline2.predict(X_test)
print(rmsle(np.expm1(y_test_scaled), np.expm1(preds_2)))

0.700254846393532


In [None]:
categorical_features = X_train.select_dtypes(include=[np.object0]).columns
categorical_features_include_bool = list(categorical_features.to_numpy())
categorical_features_include_bool.extend(list(X_train.select_dtypes(include=[np.bool8]).columns.to_numpy()))
categorical_transformer_br = Pipeline(
    steps=[
        ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehotencoding", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ]
)


preprocessor_br = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer_br, categorical_features_include_bool),
    ],
    remainder="passthrough",
)

In [24]:
import optuna
from sklearn.model_selection import cross_val_score
def objective_br(trial):
    params = {
        'fit_intercept': trial.suggest_categorical('fit_intercept',[True, False]),
        'normalize': trial.suggest_categorical('normalize',[True, False]),
        'compute_score': trial.suggest_categorical('compute_score',[True, False]),
        'alpha_1': 10**trial.suggest_float('alpha_1', -7, -4),
        'alpha_2': 10**trial.suggest_float('alpha_2', -7, -4),
        'lambda_1': 10**trial.suggest_float('lambda_1', -7, -4),
        'lambda_2': 10**trial.suggest_float('lambda_2', -7, -4),
        'tol': trial.suggest_float('tol', 1e-4, 1e-2, log=True),
        #'max_iter': trial.suggest_int('max_iter', 50, 1000)
    }
    br = Pipeline(
        steps=[("preprocessor", preprocessor_br), 
        ("br", 
        BayesianRidge(**params))]
    )
    return cross_val_score(br, X_train, y_train_scaled, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1).mean()

In [25]:
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(), direction="maximize"
)
study.optimize(objective_br, n_trials=100, timeout=6000)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-11-08 09:10:40,342][0m A new study created in memory with name: no-name-bd67a727-f3e9-407b-b867-b6d6aa94a8d9[0m
[32m[I 2022-11-08 09:11:08,049][0m Trial 0 finished with value: -0.7001524508397796 and parameters: {'fit_intercept': False, 'normalize': True, 'compute_score': True, 'alpha_1': -4.954427690657888, 'alpha_2': -6.672424516308956, 'lambda_1': -4.030023532602176, 'lambda_2': -4.2602557308529105, 'tol': 0.00351194905430385}. Best is trial 0 with value: -0.7001524508397796.[0m
[32m[I 2022-11-08 09:11:36,922][0m Trial 1 finished with value: -0.7001524838552367 and parameters: {'fit_intercept': False, 'normalize': False, 'compute_score': False, 'alpha_1': -5.425654968735715, 'alpha_2': -4.342103237269584, 'lambda_1': -5.311046856460397, 'lambda_2': -5.212486774826799, 'tol': 0.00035396833949458806}. Best is trial 0 with value: -0.7001524508397796.[0m
[32m[I 2022-11-08 09:12:00,325][0m Trial 2 finished with value: -0.7001524648715025 and parameters: {'fit_inter

KeyboardInterrupt: 

In [45]:
bayesridge = BayesianRidge()

pipeline3 = Pipeline(
    steps=[("preprocessor", preprocessor_br), 
    ("model", 
    bayesridge)])
pipeline3.fit(
  X_train,
  y_train_scaled
)

In [26]:
preds_br = pipeline3.predict(X_test)
print(rmsle(np.expm1(y_test_scaled), np.expm1(preds_br)))

0.700254846393532


In [42]:
import optuna
from sklearn.model_selection import cross_val_score
def objective_ridge(trial):
    params = {
        #'fit_intercept': trial.suggest_categorical('fit_intercept',[True, False]),
        'normalize': trial.suggest_categorical('normalize',[True, False]),
        'alpha': 10**trial.suggest_float('alpha_1', -6, 2),
        'tol': trial.suggest_float('tol', 1e-4, 1e-2, log=True),
        'solver': trial.suggest_categorical('solver', ['auto', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
        #'max_iter': trial.suggest_int('max_iter', 50, 1000)
    }
    ridge = Pipeline(
        steps=[("preprocessor", preprocessor), 
        ("ridge", 
        Ridge(**params, fit_intercept=False))]
    )
    return cross_val_score(ridge, X_train, y_train_scaled, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1).mean()

In [43]:
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(), direction="maximize"
)
study.optimize(objective_ridge, n_trials=100, timeout=6000)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-11-08 09:26:19,252][0m A new study created in memory with name: no-name-bbb5aa3e-43c4-447f-bc35-24eaa316ed64[0m
[32m[I 2022-11-08 09:26:28,003][0m Trial 0 finished with value: -0.7025731940284661 and parameters: {'normalize': True, 'alpha_1': -0.3272288090865576, 'tol': 0.0001748075366295853, 'solver': 'sparse_cg'}. Best is trial 0 with value: -0.7025731940284661.[0m
[32m[I 2022-11-08 09:26:47,687][0m Trial 1 finished with value: -0.702025207909036 and parameters: {'normalize': True, 'alpha_1': -3.9135096393793525, 'tol': 0.00030924124089561577, 'solver': 'saga'}. Best is trial 1 with value: -0.702025207909036.[0m
[32m[I 2022-11-08 09:26:48,798][0m Trial 2 finished with value: -0.7040157003234737 and parameters: {'normalize': False, 'alpha_1': -0.518011430953143, 'tol': 0.009249846416037684, 'solver': 'cholesky'}. Best is trial 1 with value: -0.702025207909036.[0m
[32m[I 2022-11-08 09:26:49,731][0m Trial 3 finished with value: -0.7091156741457214 and parameter

KeyboardInterrupt: 

In [86]:
ridge = Ridge()

pipe_ridge = Pipeline(
    steps=[("preprocessor", preprocessor), 
    ("model", 
    ridge)])
pipe_ridge.fit(
  X_train,
  y_train
)

In [87]:
preds_ridge = pipe_ridge.predict(X_test)
print(rmsle(np.expm1(y_test), np.expm1(preds_ridge)))

0.7081702684293887


In [27]:
print(rmsle(np.expm1(y_test_scaled), np.expm1(0.25*preds_2+ 0.25*preds + 0.25*preds_ridge + 0.25*preds_br)))

0.6868434576915873


In [51]:
preds_linlibSVR = pipeline.predict(X_test)
preds_Huber = pipeline2.predict(X_test)
preds_BayesRidge = pipeline3.predict(X_test)
preds_ridge = pipe_ridge.predict(X_test)

In [52]:
preds_sub = 0.25*preds_linlibSVR + 0.25*preds_Huber + 0.25*preds_BayesRidge + 0.25*preds_ridge

In [53]:
export_to_kaggle_submission(X_test, preds_sub, "2022-11-08-linear_models_average_no_zeros_optuna")

In [117]:
estimators = [
    ('svr', LinearSVR()),
    ('ridge', RidgeCV()),
    ("huber", HuberRegressor()),
    ("br", BayesianRidge())
]
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10,
                                          random_state=41)
)

pipeline_stacking = Pipeline(
    steps=[("preprocessor", preprocessor_br), 
    ("model", 
    reg)])
pipeline_stacking.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [120]:
preds_stacking = pipeline_stacking.predict(X_test)
print(rmsle(np.expm1(y_test), np.expm1(preds_stacking)))

0.790102235150488


In [170]:
estimators = [
    ('svr', LinearSVR()),
    ('ridge', RidgeCV()),
    ("huber", HuberRegressor()),
    ("br", BayesianRidge())
]
reg = VotingRegressor(
    estimators=estimators
)

pipeline_voting = Pipeline(
    steps=[("preprocessor", preprocessor_br), 
    ("model", 
    reg)])
pipeline_voting.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [171]:
preds_voting = pipeline_voting.predict(X_test)
print(rmsle(np.expm1(y_test), np.expm1(preds_voting)))

0.7131557935659976
