In [1]:
import os

In [2]:
import numpy as np
import pandas as pd

RANDOM_SEED = 42

In [3]:
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", None); 
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", None); 
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", None); 
MLFLOW_S3_ENDPOINT_URL = os.getenv("MLFLOW_S3_ENDPOINT_URL", None); 
MLFLOW_S3_IGNORE_TLS = os.getenv("MLFLOW_S3_IGNORE_TLS", None); 
MLFLOW_BUCKET_NAME = os.getenv("MLFLOW_BUCKET_NAME", None); 
MLFLOW_SERVER = os.getenv("MLFLOW_SERVER", None); 

print("MINIO_ACCESS_KEY", MINIO_ACCESS_KEY)
print("MINIO_SECRET_KEY", MINIO_SECRET_KEY)
print("MLFLOW_TRACKING_URI", MLFLOW_TRACKING_URI)
print("MLFLOW_S3_ENDPOINT_URL", MLFLOW_S3_ENDPOINT_URL)
print("MLFLOW_S3_IGNORE_TLS", MLFLOW_S3_IGNORE_TLS)
print("MLFLOW_BUCKET_NAME", MLFLOW_BUCKET_NAME)
print("MLFLOW_SERVER", MLFLOW_SERVER)


MINIO_ACCESS_KEY test_menio_access_key
MINIO_SECRET_KEY test_minio_secret_key
MLFLOW_TRACKING_URI "postgresql://mlflow:mlflow_pass@postgres:5432"
MLFLOW_S3_ENDPOINT_URL http://minio:9000
MLFLOW_S3_IGNORE_TLS true
MLFLOW_BUCKET_NAME mlflow-artifacts
MLFLOW_SERVER http://mlflow:5000


In [8]:
!wget -O train.csv "https://raw.githubusercontent.com/jelambrar96-datatalks/house-price-predictor/refs/heads/main/dataset/train.csv"
pass

--2025-01-12 16:43:43--  https://raw.githubusercontent.com/jelambrar96-datatalks/house-price-predictor/refs/heads/main/dataset/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 460676 (450K) [text/plain]
Saving to: ‘train.csv’


2025-01-12 16:43:43 (5.25 MB/s) - ‘train.csv’ saved [460676/460676]



In [9]:
df_full = pd.read_csv("train.csv")

In [10]:
import re

# Function to convert camelCase or PascalCase to snake_case
def to_snake_case(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

df_full.columns = [to_snake_case(col) for col in df_full.columns]

In [11]:
df_full.drop(
    columns=["id", "alley", "pool_qc", "fence", "misc_feature", "mas_vnr_type", "fireplace_qu", "lot_frontage"],
    inplace=True
    )
df_full.dropna(inplace=True)

In [12]:
df_full.shape

(1338, 73)

In [13]:
TARGET_COLUMN = "sale_price"

df_full[TARGET_COLUMN] = np.log1p(df_full[TARGET_COLUMN])

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
df_full_train, df_test = train_test_split(
    df_full, test_size=0.2, random_state=RANDOM_SEED)

df_full_train = df_full_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_full_train = (df_full_train[TARGET_COLUMN]).astype('int').values
y_test = (df_test[TARGET_COLUMN]).astype('int').values

del df_full_train[TARGET_COLUMN]
del df_test[TARGET_COLUMN]


In [16]:
from sklearn.preprocessing import StandardScaler

numerical_cols = df_full_train.select_dtypes(include=['number']).columns
scaler = StandardScaler()

df_full_train[numerical_cols] = scaler.fit_transform(df_full_train[numerical_cols])

In [17]:
from sklearn.feature_extraction import DictVectorizer

categorical_cols = df_full_train.select_dtypes(include=['object']).columns
dv = DictVectorizer(sparse=False)

full_train_dict = df_full_train.to_dict(orient='records')
X_full_train = dv.fit_transform(full_train_dict)

In [31]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


def select_best_model(
    estimator,
    params,
    X_full_train,
    y_full_train,
    random_state=None,
    n_jobs=-1):

    rs = RandomizedSearchCV(
        estimator,
        params,
        random_state=random_state,
        n_jobs=n_jobs
    )
    search = rs.fit(X_full_train, y_full_train)
    best_params = search.best_params_
    best_estimator = search.best_estimator_
    best_score = search.best_score_
    
    return best_estimator, best_params, best_score

In [46]:
from sklearn.linear_model import LinearRegression

linear_regression_params = {
    'fit_intercept': [True, False]
}

linear_regression_grid_search = GridSearchCV(
    estimator=LinearRegression(),
    param_grid=linear_regression_params,
    n_jobs=-1)
linear_regression_grid_search_fitted = linear_regression_grid_search.fit(X_full_train, y_full_train)

best_linear_regression_estimator = linear_regression_grid_search_fitted.best_estimator_
best_linear_regression_params = linear_regression_grid_search_fitted.best_params_
best_linear_regression_score = linear_regression_grid_search_fitted.best_score_

In [44]:
from sklearn.linear_model import Lasso

lasso_params = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

lasso_grid_search = GridSearchCV(
    estimator=Lasso(),
    param_grid=lasso_params,
    n_jobs=-1)
lasso_grid_search_fitted = lasso_grid_search.fit(X_full_train, y_full_train)

best_lasso_estimator = lasso_grid_search_fitted.best_estimator_
best_lasso_params = lasso_grid_search_fitted.best_params_
best_lasso_score = lasso_grid_search_fitted.best_score_


In [47]:
from sklearn.tree import DecisionTreeRegressor

decision_tree_params = {
    'criterion': ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ["sqrt", "log2", 0.5, 1.0],
    'random_state': [RANDOM_SEED]
}

decision_tree_search = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(),
    param_distributions=decision_tree_params,
    n_jobs=-1,
    random_state=RANDOM_SEED
)
decision_tree_search_fitted = decision_tree_search.fit(X_full_train, y_full_train)

best_decision_tree_estimator = decision_tree_search_fitted.best_estimator_
best_decision_tree_params = decision_tree_search_fitted.best_params_
best_decision_tree_score = decision_tree_search_fitted.best_score_


In [49]:
from sklearn.ensemble import RandomForestRegressor

random_forest_params = {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_features": ["sqrt", "log2", 0.5, 1.0],
    "bootstrap": [True, False],
    "oob_score": [True, False],
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "ccp_alpha": [0.0, 0.1, 0.2],
    "random_state": [RANDOM_SEED]
}

random_forest_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=random_forest_params,
    n_jobs=-1,
    random_state=RANDOM_SEED
)
random_forest_search_fitted = random_forest_search.fit(X_full_train, y_full_train)

best_random_forest_estimator = random_forest_search_fitted.best_estimator_
best_random_forest_params = random_forest_search_fitted.best_params_
best_random_forest_score = random_forest_search_fitted.best_score_

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 448, in fit
    raise ValueError("Out of bag estimation only available if bootstrap=True")
ValueError: Out of bag estimation only available if bootstrap=True

 -0.00549691         nan  0.51147192   

In [51]:
from sklearn.ensemble import AdaBoostRegressor

adaboost_regressor_params = {
  "learning_rate": [0.01, 0.1, 1],
  "n_estimators": [10, 50, 100, 200],
  "loss": ["linear", "square", "exponential"],
  "estimator": [
      DecisionTreeRegressor(),
      RandomForestRegressor(),
      # DecisionTreeRegressor(**best_decision_tree_params),
      # RandomForestRegressor(**best_random_forest_params)
  ],
  "random_state": [RANDOM_SEED]
}

adaboost_regressor_search = RandomizedSearchCV(
    estimator=AdaBoostRegressor(),
    param_distributions=adaboost_regressor_params,
    n_jobs=-1,
    random_state=RANDOM_SEED
)
adaboost_regressor_search_fitted = adaboost_regressor_search.fit(X_full_train, y_full_train)

best_adaboost_regressor_estimator = adaboost_regressor_search_fitted.best_estimator_
best_adaboost_regressor_params = adaboost_regressor_search_fitted.best_params_
best_adaboost_regressor_score = adaboost_regressor_search_fitted.best_score_


In [53]:
from sklearn.ensemble import GradientBoostingRegressor

gradientboost_regressor_params = {
  "learning_rate": [0.1, 0.05, 0.01],
  "n_estimators": [50, 100, 200],
  "max_depth": [3, 5, 7],
  "min_samples_split": [2, 5, 10],
  "min_samples_leaf": [1, 2, 4],
  "max_features": ["auto", "sqrt", "log2"],
  "subsample": [1.0, 0.8, 0.5],
  "loss": ["squared_error", "absolute_error", "huber"],
  "alpha": [0.5, 0.75, 0.9],
  "random_state": [RANDOM_SEED]
}

gradientboost_regressor_search = RandomizedSearchCV(
    estimator=GradientBoostingRegressor(),
    param_distributions=gradientboost_regressor_params,
    n_jobs=-1,
    random_state=RANDOM_SEED
)
gradientboost_regressor_search_fitted = gradientboost_regressor_search.fit(X_full_train, y_full_train)


gradientboost_regressor_estimator = gradientboost_regressor_search_fitted.best_estimator_
gradientboost_regressor_params = gradientboost_regressor_search_fitted.best_params_
gradientboost_regressor_score = gradientboost_regressor_search_fitted.best_score_



20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
13 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

In [55]:
from sklearn.neural_network import MLPRegressor

mlp_regressor_params = {
    'hidden_layer_sizes': [(32,), (64,), (128,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['lbfgs', 'adam'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'batch_size': ['auto', 200],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'power_t': [0.5, 1.0, 2.0],
    'max_iter': [500, 1000, 1500],
    'early_stopping': [True, False],
    'validation_fraction': [0.2, 0.5, 0.8]
}

mlp_regressor_search = RandomizedSearchCV(
    estimator=MLPRegressor(),
    param_distributions=mlp_regressor_params,
    n_jobs=-1,
    random_state=RANDOM_SEED
)
mlp_regressor_search_fitted = mlp_regressor_search.fit(X_full_train, y_full_train)

mlp_regressor_estimator = mlp_regressor_search_fitted.best_estimator_
mlp_regressor_params = mlp_regressor_search_fitted.best_params_
mlp_regressor_score = mlp_regressor_search_fitted.best_score_



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("