In [1]:
import numpy as np
import pandas as pd

RANDOM_SEED = 42

In [2]:
!wget -O train.csv "https://github.com/jelambrar96-datatalks/house-price-predictor/blob/main/dataset/train.csv"

--2025-01-06 17:40:41--  https://github.com/jelambrar96-datatalks/house-price-predictor/blob/main/dataset/train.csv
Resolving github.com (github.com)... 140.82.112.3
connected. to github.com (github.com)|140.82.112.3|:443... 
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘train.csv’

train.csv               [     <=>            ] 847.25K   876KB/s    in 1.0s    

2025-01-06 17:40:43 (876 KB/s) - ‘train.csv’ saved [867582]



In [3]:
DATASET_FILE = "../dataset/train.csv"
df_full = pd.read_csv(DATASET_FILE)

In [4]:
import re

# Function to convert camelCase or PascalCase to snake_case
def to_snake_case(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

df_full.columns = [to_snake_case(col) for col in df_full.columns]

In [5]:
df_full.drop(
    columns=["id", "alley", "pool_qc", "fence", "misc_feature", "mas_vnr_type", "fireplace_qu", "lot_frontage"],
    inplace=True
    )
df_full.dropna(inplace=True)

In [6]:
df_full.shape

(1338, 73)

In [7]:
TARGET_COLUMN = "sale_price"

df_full[TARGET_COLUMN] = np.log1p(df_full[TARGET_COLUMN])

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
df_full_train, df_test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=RANDOM_SEED)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = (df_train[TARGET_COLUMN]).astype('int').values
y_val = (df_val[TARGET_COLUMN]).astype('int').values
y_test = (df_test[TARGET_COLUMN]).astype('int').values

del df_train[TARGET_COLUMN]
del df_val[TARGET_COLUMN]
del df_test[TARGET_COLUMN]

In [10]:
from sklearn.preprocessing import StandardScaler

numerical_cols = df_train.select_dtypes(include=['number']).columns
scaler = StandardScaler()

df_train[numerical_cols] = scaler.fit_transform(df_train[numerical_cols])
df_val[numerical_cols] = scaler.transform(df_val[numerical_cols])

In [11]:
from sklearn.feature_extraction import DictVectorizer

categorical_cols = df_train.select_dtypes(include=['object']).columns
dv = DictVectorizer(sparse=False)


In [12]:
# import RegressionExperiment and init the class
# from pycaret.regression import RegressionExperiment
# exp = RegressionExperiment()

In [13]:
# init setup on exp
# exp.setup(df_train, target = y_train, session_id = 123)

In [14]:
# best = exp.compare_models()

## auxiliar functions

In [15]:
from sklearn.pipeline import Pipeline

def evaluate_model(model, X_train, y_train, X_val, y_val, params, score_function):
    """
    # Función para evaluar un conjunto de hiperparámetros
    """
    """
    pipeline = Pipeline([
        ('logistic', model(**params))
    ])
    """
    pipeline = model(**params)
    pipeline.fit(X_train, y_train)
    y_val_pred = pipeline.predict(X_val)
    score = score_function(y_val, y_val_pred)
    return score, pipeline

In [16]:
from itertools import product

def find_best_model(
        Model,
        parameter_grid,
        X_train,
        y_train,
        X_val,
        y_val,
        score_function,
        verbose=False):
    best_score = -np.inf
    best_params = None
    best_model = None

    parameter_labels = parameter_grid.keys()
    parameter_values = parameter_grid.values()

    for temp_parameter_iterable in product(*parameter_values):
        params = { label:value for label, value in zip(parameter_labels, temp_parameter_iterable) }
        if verbose:
            print()
            print(params)

        # Evaluamos los parámetros
        try:
            score, model = evaluate_model(
                Model, X_train, y_train, X_val, y_val, params, score_function
            )
        except ValueError as ve:
            if verbose:
                print(ve)
            continue

        if verbose:
            print(f"score_function: {score}")
        
        # Actualizamos mejor modelo si es necesario
        if score > best_score:
            best_score = score
            best_params = params
            best_model = model

    return best_model, best_params, best_score

In [17]:
from sklearn.metrics import root_mean_squared_error


def score_function_regression(y_true, y_pred, tag="rmse"):
    return -1 * root_mean_squared_error(y_true, y_pred)

In [18]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

In [19]:
# model 1 lienar regression
from mlmodels.skmodels import ZCLinearRegression

parameter_grid_linear_regression = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'positive': [True, False],
    'n_jobs': [-1, 1, 2, 4]
}

best_model_linear_regression, best_params_linear_regression, best_mrse_linear_regression = find_best_model(
    ZCLinearRegression,
    parameter_grid_linear_regression,
    X_train,
    y_train,
    X_val,
    y_val,
    score_function_regression,
    verbose=False
)

print("linear regression trained")

linear regression trained


In [20]:
from mlmodels.skmodels import ZCLassoRegression

parameter_grid_lasso_regression = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

best_model_lasso_regression, best_params_lasso_regression, best_lasso_regression = find_best_model(
    ZCLassoRegression,
    parameter_grid_lasso_regression,
    X_train,
    y_train,
    X_val,
    y_val,
    score_function_regression,
    verbose=False
)

print("lasso regression trained")


lasso regression trained


In [22]:
from mlmodels.skmodels import ZCDecisionTreeRegressor

parameter_grid_decision_tree_regressor = {
    'criterion': ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ["sqrt", "log2", 0.5, 1.0],
    'random_state': [RANDOM_SEED]
}

best_model_decision_tree_regressor, best_params_decision_tree_regressor, best_decision_tree_regressor = find_best_model(
    ZCDecisionTreeRegressor,
    parameter_grid_decision_tree_regressor,
    X_train,
    y_train,
    X_val,
    y_val,
    score_function_regression,
    verbose=True
)

print("decition tree regression trained")


{'criterion': 'squared_error', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'random_state': 42}
The 'max_features' parameter of DecisionTreeRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.

{'criterion': 'squared_error', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'random_state': 42}
score_function: -0.40055931045730275

{'criterion': 'squared_error', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'random_state': 42}
score_function: -0.42320736951515897

{'criterion': 'squared_error', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'random_state': 42}
score_function: -0.35618207653079775

{'criterion': 'squared_error', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'random_state': 42}
sc

In [24]:
from mlmodels.skmodels import ZCRandomForestRegressor

parameter_grid_random_forest_regressor = {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_features": ["sqrt", "log2", 0.5, 1.0],
    "bootstrap": [True, False],
    "oob_score": [True, False],
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "ccp_alpha": [0.0, 0.1, 0.2],
    "random_state": [RANDOM_SEED]
  }

best_model_random_forest_regressor, best_params_random_forest_regressor, best_random_forest_regressor = find_best_model(
    ZCRandomForestRegressor,
    parameter_grid_random_forest_regressor,
    X_train,
    y_train,
    X_val,
    y_val,
    score_function_regression,
    verbose=True
)

print("random_forest_regressor trained")


{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'bootstrap': True, 'oob_score': True, 'criterion': 'squared_error', 'ccp_alpha': 0.0, 'random_state': 42}
The 'max_features' parameter of RandomForestRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'bootstrap': True, 'oob_score': True, 'criterion': 'squared_error', 'ccp_alpha': 0.1, 'random_state': 42}
The 'max_features' parameter of RandomForestRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'bootstrap': True, 'oob_score': True, 'criterion': 'squared_error', 'ccp_alpha': 0.2, 

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


score_function: -0.2833699127689575

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.1, 'random_state': 42}


  warn(


score_function: -0.3888367650451103

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.2, 'random_state': 42}


  warn(
  warn(
  warn(
  warn(


score_function: -0.5126751611844699

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.25208089201564443

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.0, 'random_state': 42}


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


score_function: -0.2920590821438131

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.40186134090433834

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5984744286860092

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.2895570471981118

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.1, 'random_state

  warn(
  warn(
  warn(
  warn(
  warn(


score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.29900331950436077

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.50008388859413

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.2, 'random_

  warn(
  warn(
  warn(
  warn(


score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.23404298622899397

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.3929975791419667

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.0, 'random_state': 4

  warn(
  warn(
  warn(


score_function: -0.28158665589143145

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.1, 'random_state': 42}


  warn(


score_function: -0.3350762889298272

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.2, 'random_state': 42}


  warn(
  warn(
  warn(
  warn(


score_function: -0.3350762889298272

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.23115543649435435

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.0, 'random_state': 42}
score_functi

  warn(
  warn(
  warn(


score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.237288790610205

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.35851843871302436

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.2, 'random_state': 42}


  warn(
  warn(
  warn(


score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.0, 'random_state': 42}


  warn(


score_function: -0.26953525508056136

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.1, 'random_state': 42}


  warn(


score_function: -0.36803228281501327

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.2, 'random_state': 42}


  warn(
  warn(


score_function: -0.36803228281501327

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.2342820089155216

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.1, 'random_state': 42}


  warn(
  warn(


score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.237288790610205

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.35851843871302436

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 1.0, 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.2, 'random_state': 42}

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.2891756135404707

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.50008388859413

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.0, 'random_

  warn(
  warn(
  warn(
  warn(
  warn(


score_function: -0.41533119314590383

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5126751611844699

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.29492775009158034

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.2, 'random_state': 42}

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


score_function: -0.3277154071137051

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.40018652367532725

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5984744286860092

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.0, 'random_state': 42}


  warn(
  warn(
  warn(
  warn(
  warn(


score_function: -0.30607255400655675

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.30243173391580747

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.1, 'random_stat

  warn(
  warn(
  warn(
  warn(


score_function: -0.26125908430310385

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'squared_error', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.3929975791419667

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'squared_error', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.26125908430310385

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.1, 'random_state': 

  warn(
  warn(


score_function: -0.3929975791419667

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.0, 'random_state': 42}


  warn(


score_function: -0.2699675215402163

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.1, 'random_state': 42}


  warn(


score_function: -0.3350762889298272

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.2, 'random_state': 42}


  warn(
  warn(
  warn(
  warn(


score_function: -0.3350762889298272

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.25695419523831864

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5, 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.0, 'random_state': 42}
score_functi

  warn(
  warn(
  warn(


score_function: -0.2567668173729476

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'squared_error', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.35851843871302436

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'squared_error', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.0, 'random_state': 42}


  warn(
  warn(
  warn(


score_function: -0.2567668173729476

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.35851843871302436

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'friedman_mse', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.0, 'random_state': 42}


  warn(


score_function: -0.2646103866374445

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.1, 'random_state': 42}


  warn(


score_function: -0.36803228281501327

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'absolute_error', 'ccp_alpha': 0.2, 'random_state': 42}


  warn(
  warn(
  warn(


score_function: -0.36803228281501327

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.25165912280949093

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': True, 'criterion': 'poisson', 'ccp_alpha': 0.2, 'random_state': 42}


  warn(


score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.0, 'random_state': 42}
score_function: -0.2567668173729476

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.1, 'random_state': 42}
score_function: -0.35851843871302436

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': False, 'criterion': 'squared_error', 'ccp_alpha': 0.2, 'random_state': 42}
score_function: -0.5204203455586334

{'n_estimators': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 1.0, 'bootstrap': True, 'oob_score': False, 'criterion': 'friedman_mse', 'ccp_alpha': 0.0, 'random_stat

KeyboardInterrupt: 

In [29]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from mlmodels.skmodels import ZCAdaBoostRegressor


parameter_grid_adaboost_regressor = {
  "learning_rate": [0.1, 0.5, 1],
  "n_estimators": [10, 50, 100, 200],
  "loss": ["linear", "square", "exponential"],
  "estimator": [DecisionTreeRegressor(), RandomForestRegressor()],
  "random_state": [RANDOM_SEED]
}

best_model_adaboost_regressor, best_params_adaboost_regressor, best_adaboost_regressor = find_best_model(
    ZCAdaBoostRegressor,
    parameter_grid_adaboost_regressor,
    X_train,
    y_train,
    X_val,
    y_val,
    score_function_regression,
    verbose=True
)

print("lasso regression trained")


{'learning_rate': 0.1, 'n_estimators': 10, 'loss': 'linear', 'estimator': DecisionTreeRegressor(), 'random_state': 42}
score_function: -0.28651274358884277

{'learning_rate': 0.1, 'n_estimators': 10, 'loss': 'linear', 'estimator': RandomForestRegressor(), 'random_state': 42}
score_function: -0.23718183716422517

{'learning_rate': 0.1, 'n_estimators': 10, 'loss': 'square', 'estimator': DecisionTreeRegressor(), 'random_state': 42}
score_function: -0.3401053392568908

{'learning_rate': 0.1, 'n_estimators': 10, 'loss': 'square', 'estimator': RandomForestRegressor(), 'random_state': 42}
score_function: -0.23353704682299545

{'learning_rate': 0.1, 'n_estimators': 10, 'loss': 'exponential', 'estimator': DecisionTreeRegressor(), 'random_state': 42}
score_function: -0.3174055271363692

{'learning_rate': 0.1, 'n_estimators': 10, 'loss': 'exponential', 'estimator': RandomForestRegressor(), 'random_state': 42}
score_function: -0.23807452113138922

{'learning_rate': 0.1, 'n_estimators': 50, 'loss'

KeyboardInterrupt: 

In [30]:
from mlmodels.skmodels import ZCGradientBoostingRegressor

parameter_grid_gradientboost_regressor = {
  "learning_rate": [0.1, 0.05, 0.01],
  "n_estimators": [50, 100, 200],
  "max_depth": [3, 5, 7],
  "min_samples_split": [2, 5, 10],
  "min_samples_leaf": [1, 2, 4],
  "max_features": ["auto", "sqrt", "log2"],
  "subsample": [1.0, 0.8, 0.5],
  "loss": ["squared_error", "absolute_error", "huber"],
  "alpha": [0.5, 0.75, 0.9],
  "random_state": [RANDOM_SEED]
}

best_model_gradientboost_regressor, best_params_gradientboost_regressor, best_gradientboost_regressor = find_best_model(
    ZCGradientBoostingRegressor,
    parameter_grid_gradientboost_regressor,
    X_train,
    y_train,
    X_val,
    y_val,
    score_function_regression,
    verbose=True
)

print("grandient boost trained")


{'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'subsample': 1.0, 'loss': 'squared_error', 'alpha': 0.5, 'random_state': 42}
The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.

{'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'subsample': 1.0, 'loss': 'squared_error', 'alpha': 0.75, 'random_state': 42}
The 'max_features' parameter of GradientBoostingRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.

{'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'subsample': 1.0, 'loss': 'squared_error', 'alpha': 0.9, 'random_state': 42}
T

KeyboardInterrupt: 