In [1]:
import numpy as np
import pandas as pd

RANDOM_SEED = 42

In [2]:
!wget -O train.csv "https://github.com/jelambrar96-datatalks/house-price-predictor/blob/main/dataset/train.csv"

--2025-01-07 18:02:53--  https://github.com/jelambrar96-datatalks/house-price-predictor/blob/main/dataset/train.csv
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘train.csv’

train.csv               [ <=>                ] 910.64K  --.-KB/s    in 0.04s   

2025-01-07 18:02:54 (21.1 MB/s) - ‘train.csv’ saved [932500]



In [3]:
# Path to the training CSV, relative to the notebook's working directory.
DATASET_FILE = "../dataset/train.csv"

# Full dataset, before any cleaning or splitting.
df_full = pd.read_csv(filepath_or_buffer=DATASET_FILE)

In [4]:
import re

def to_snake_case(name):
    """Convert a camelCase or PascalCase identifier to snake_case.

    Underscores are inserted in two passes and the result is lower-cased:

    1. before a capitalised word (one uppercase letter followed by one or
       more lowercase letters) that is preceded by any character;
    2. between a lowercase letter or digit and an uppercase letter.

    Args:
        name: The identifier to convert, e.g. ``"MSSubClass"``.

    Returns:
        The snake_case form, e.g. ``"ms_sub_class"``.
    """
    capitalised_word = re.compile('(.)([A-Z][a-z]+)')
    lower_then_upper = re.compile('([a-z0-9])([A-Z])')

    after_first_pass = capitalised_word.sub(r'\1_\2', name)
    after_second_pass = lower_then_upper.sub(r'\1_\2', after_first_pass)
    return after_second_pass.lower()

# Normalise every column header to snake_case so later cells can refer to
# columns by names such as "sale_price" and "lot_frontage".
df_full.columns = list(map(to_snake_case, df_full.columns))

In [5]:
# Remove the listed columns, then discard every row that still contains a
# missing value in any remaining column.
# NOTE(review): the reason for dropping these particular columns is not stated
# in the notebook - presumably they are identifiers or mostly-missing features;
# confirm.
df_full = (
    df_full
    .drop(columns=[
        "id",
        "alley",
        "pool_qc",
        "fence",
        "misc_feature",
        "mas_vnr_type",
        "fireplace_qu",
        "lot_frontage",
    ])
    .dropna()
)

In [6]:
# Display (rows, columns) of the frame after the column drop and dropna step.
df_full.shape

(1338, 73)

In [7]:
# Name of the column the models are trained to predict.
TARGET_COLUMN = "sale_price"

# Replace the target with log1p(target), i.e. log(1 + x).
# Note: running this cell twice applies the transform twice.
df_full[TARGET_COLUMN] = df_full[TARGET_COLUMN].pipe(np.log1p)

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# 60/20/20 split: hold out 20% as test, then take 25% of the remaining 80%
# (= 20% of the full data) as validation. Both splits use the same seed.
df_full_train, df_test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=RANDOM_SEED)

# Drop the shuffled original index so each frame is indexed 0..n-1.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Separate the target from the features. The target holds log1p(sale_price)
# and must stay floating point: the previous `.astype('int')` truncated it to
# a handful of integer values, which discards almost all of the signal the
# regressors are supposed to learn. `pop` returns the column and removes it
# from the feature frame in one step.
y_train = df_train.pop(TARGET_COLUMN).values
y_val = df_val.pop(TARGET_COLUMN).values
y_test = df_test.pop(TARGET_COLUMN).values

In [10]:
from sklearn.preprocessing import StandardScaler

# Numeric feature columns, selected from the training frame.
numerical_cols = df_train.select_dtypes(include=['number']).columns
scaler = StandardScaler()

# Fit the scaler on the training split only, then apply the same fitted
# transform to validation AND test. The test frame was previously left
# unscaled, so the test features fed to the models were on a different scale
# from the features they were trained on.
df_train[numerical_cols] = scaler.fit_transform(df_train[numerical_cols])
df_val[numerical_cols] = scaler.transform(df_val[numerical_cols])
df_test[numerical_cols] = scaler.transform(df_test[numerical_cols])

In [11]:
from sklearn.feature_extraction import DictVectorizer

# Names of the object-dtype (string) feature columns in the training frame.
categorical_cols = df_train.select_dtypes(include='object').columns

# Dense-output vectorizer.
# NOTE(review): `dv` is created again in a later cell before it is ever
# fitted, and `categorical_cols` is not referenced in the visible cells -
# this cell may be removable; confirm.
dv = DictVectorizer(sparse=False)


In [12]:
# import RegressionExperiment and init the class
# from pycaret.regression import RegressionExperiment
# exp = RegressionExperiment()

In [13]:
# init setup on exp
# exp.setup(df_train, target = y_train, session_id = 123)

In [14]:
# best = exp.compare_models()

## Auxiliary functions

In [15]:
from sklearn.pipeline import Pipeline

def evaluate_model(model, X_train, y_train, X_val, y_val, params, score_function):
    """Fit a single hyperparameter configuration and score it on validation data.

    Args:
        model: Callable (typically an estimator class) invoked as
            ``model(**params)``; the object it returns must provide
            ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        X_val: Validation features passed to ``predict``.
        y_val: Validation targets passed to ``score_function``.
        params: Mapping of keyword arguments used to construct the estimator.
        score_function: Callable invoked as ``score_function(y_val, y_val_pred)``.

    Returns:
        tuple: ``(score, estimator)`` - the value returned by
        ``score_function`` and the fitted estimator instance.
    """
    estimator = model(**params)
    estimator.fit(X_train, y_train)
    validation_predictions = estimator.predict(X_val)
    return score_function(y_val, validation_predictions), estimator

In [16]:
from itertools import product

def find_best_model(
        Model,
        parameter_grid,
        X_train,
        y_train,
        X_val,
        y_val,
        score_function,
        verbose=False,
        run_once=False):
    """Search a hyperparameter grid and keep the highest-scoring model.

    Every combination in the Cartesian product of the grid's value lists is
    passed to ``evaluate_model``. A combination that raises ``ValueError`` is
    skipped (the error is printed only when ``verbose`` is true). A higher
    score is treated as better.

    Args:
        Model: Estimator class or factory forwarded to ``evaluate_model``.
        parameter_grid: Mapping of parameter name to a list of candidate values.
        X_train: Training features.
        y_train: Training targets.
        X_val: Validation features.
        y_val: Validation targets.
        score_function: Callable ``(y_true, y_pred) -> score``.
        verbose: When true, print each parameter set, its score, and any
            ``ValueError`` raised while evaluating it.
        run_once: When true, stop after the first combination that is
            evaluated without raising ``ValueError``.

    Returns:
        tuple: ``(best_model, best_params, best_score)``. If no combination
        could be evaluated this is ``(None, None, -inf)``.
    """
    names = parameter_grid.keys()
    # (model, params, score) of the best candidate seen so far.
    best = (None, None, -np.inf)

    for combination in product(*parameter_grid.values()):
        params = dict(zip(names, combination))
        if verbose:
            print()
            print(params)

        # Evaluate this parameter set; invalid combinations are skipped.
        try:
            score, model = evaluate_model(
                Model, X_train, y_train, X_val, y_val, params, score_function
            )
        except ValueError as error:
            if verbose:
                print(error)
            continue

        if verbose:
            print(f"score_function: {score}")

        # Keep this candidate only if it strictly beats the current best.
        if score > best[2]:
            best = (model, params, score)

        if run_once:
            break

    return best

In [17]:
from sklearn.metrics import root_mean_squared_error


def score_function_regression(y_true, y_pred, tag="rmse"):
    """Return the negated root-mean-squared error of the predictions.

    The sign is flipped so that a larger score means a better fit, which is
    what a "higher is better" search expects.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
        tag: Accepted but not used by this function.

    Returns:
        ``-root_mean_squared_error(y_true, y_pred)``.
    """
    rmse = root_mean_squared_error(y_true, y_pred)
    return -1 * rmse

In [18]:
from sklearn.feature_extraction import DictVectorizer

# Dense-output vectorizer used to turn row dictionaries into feature matrices.
dv = DictVectorizer(sparse=False)

# One list of {column: value} records per split.
train_dict, val_dict, test_dict = (
    frame.to_dict(orient='records') for frame in (df_train, df_val, df_test)
)

# The vectorizer is fitted on the training records only; validation and test
# are encoded with that same fitted mapping.
X_train = dv.fit_transform(train_dict)
X_val, X_test = dv.transform(val_dict), dv.transform(test_dict)

In [None]:
# model 1: linear regression
from mlmodels.skmodels import ZCLinearRegression

# Candidate constructor arguments for the linear regression model.
parameter_grid_linear_regression = dict(
    fit_intercept=[True, False],
    copy_X=[True, False],
    positive=[True, False],
    n_jobs=[-1, 1, 2, 4],
)

# run_once=True makes find_best_model return after the first combination that
# fits without raising ValueError, so the rest of the grid is not evaluated.
(
    best_model_linear_regression,
    best_params_linear_regression,
    best_mrse_linear_regression,
) = find_best_model(
    Model=ZCLinearRegression,
    parameter_grid=parameter_grid_linear_regression,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    score_function=score_function_regression,
    verbose=False,
    run_once=True,
)

print("linear regression trained")

linear regression trained


In [None]:
from mlmodels.skmodels import ZCLassoRegression

# Candidate regularisation strengths for the lasso model.
parameter_grid_lasso_regression = dict(
    alpha=[0.001, 0.01, 0.1, 1, 10, 100],
)

# run_once=True makes find_best_model return after the first combination that
# fits without raising ValueError, so the rest of the grid is not evaluated.
(
    best_model_lasso_regression,
    best_params_lasso_regression,
    best_lasso_regression,
) = find_best_model(
    Model=ZCLassoRegression,
    parameter_grid=parameter_grid_lasso_regression,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    score_function=score_function_regression,
    verbose=False,
    run_once=True,
)

print("lasso regression trained")


lasso regression trained


In [21]:
from mlmodels.skmodels import ZCDecisionTreeRegressor

# Candidate constructor arguments for the decision tree regressor.
parameter_grid_decision_tree_regressor = dict(
    criterion=["squared_error", "friedman_mse", "absolute_error", "poisson"],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
    max_features=["sqrt", "log2", 0.5, 1.0],
    random_state=[RANDOM_SEED],
)

# run_once=True makes find_best_model return after the first combination that
# fits without raising ValueError, so the rest of the grid is not evaluated.
(
    best_model_decision_tree_regressor,
    best_params_decision_tree_regressor,
    best_decision_tree_regressor,
) = find_best_model(
    Model=ZCDecisionTreeRegressor,
    parameter_grid=parameter_grid_decision_tree_regressor,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    score_function=score_function_regression,
    verbose=False,
    run_once=True,
)

print("decition tree regression trained")

decition tree regression trained


In [None]:
from mlmodels.skmodels import ZCRandomForestRegressor

# Candidate constructor arguments for the random forest regressor.
parameter_grid_random_forest_regressor = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
    max_features=["sqrt", "log2", 0.5, 1.0],
    bootstrap=[True, False],
    oob_score=[True, False],
    criterion=["squared_error", "friedman_mse", "absolute_error", "poisson"],
    ccp_alpha=[0.0, 0.1, 0.2],
    random_state=[RANDOM_SEED],
)

# run_once=True makes find_best_model return after the first combination that
# fits without raising ValueError, so the rest of the grid is not evaluated.
(
    best_model_random_forest_regressor,
    best_params_random_forest_regressor,
    best_random_forest_regressor,
) = find_best_model(
    Model=ZCRandomForestRegressor,
    parameter_grid=parameter_grid_random_forest_regressor,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    score_function=score_function_regression,
    verbose=False,
    run_once=True,
)

print("random_forest_regressor trained")

random_forest_regressor trained


  warn(


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from mlmodels.skmodels import ZCAdaBoostRegressor


# Candidate constructor arguments for the AdaBoost regressor. Both a decision
# tree and a random forest are tried as the base estimator.
parameter_grid_adaboost_regressor = {
  "learning_rate": [0.1, 0.5, 1],
  "n_estimators": [10, 50, 100, 200],
  "loss": ["linear", "square", "exponential"],
  "estimator": [DecisionTreeRegressor(), RandomForestRegressor()],
  "random_state": [RANDOM_SEED]
}

# run_once=True makes find_best_model return after the first combination that
# fits without raising ValueError, so the rest of the grid is not evaluated.
best_model_adaboost_regressor, best_params_adaboost_regressor, best_adaboost_regressor = find_best_model(
    ZCAdaBoostRegressor,
    parameter_grid_adaboost_regressor,
    X_train,
    y_train,
    X_val,
    y_val,
    score_function_regression,
    verbose=False,
    run_once=True
)

# The message previously said "lasso regression trained" (copied from the
# lasso cell), which misreported which model had just finished.
print("adaboost regression trained")

lasso regression trained


In [None]:
from mlmodels.skmodels import ZCGradientBoostingRegressor

# Candidate constructor arguments for the gradient boosting regressor.
# "max_features" uses None (consider all features) rather than "auto":
# scikit-learn releases that provide `root_mean_squared_error` no longer
# accept "auto" for this parameter, so those combinations raised ValueError
# and were silently skipped by find_best_model.
# NOTE(review): assumes ZCGradientBoostingRegressor forwards these arguments
# to scikit-learn's GradientBoostingRegressor - confirm.
parameter_grid_gradientboost_regressor = {
  "learning_rate": [0.1, 0.05, 0.01],
  "n_estimators": [50, 100, 200],
  "max_depth": [3, 5, 7],
  "min_samples_split": [2, 5, 10],
  "min_samples_leaf": [1, 2, 4],
  "max_features": [None, "sqrt", "log2"],
  "subsample": [1.0, 0.8, 0.5],
  "loss": ["squared_error", "absolute_error", "huber"],
  "alpha": [0.5, 0.75, 0.9],
  "random_state": [RANDOM_SEED]
}

# run_once=True makes find_best_model return after the first combination that
# fits without raising ValueError, so the rest of the grid is not evaluated.
best_model_gradientboost_regressor, best_params_gradientboost_regressor, best_gradientboost_regressor = find_best_model(
    ZCGradientBoostingRegressor,
    parameter_grid_gradientboost_regressor,
    X_train,
    y_train,
    X_val,
    y_val,
    score_function_regression,
    verbose=False,
    run_once=True
)

print("grandient boost trained")

grandient boost trained


In [None]:
import mlflow
import mlflow.sklearn

# Enable MLflow's scikit-learn autologging before any model is refitted.
mlflow.sklearn.autolog()

# (MLflow run name, selected model) for each model family searched earlier.
final_models = [
    ("linear_regression", best_model_linear_regression),
    ("lasso_regression", best_model_lasso_regression),
    ("decision_tree_regression", best_model_decision_tree_regressor),
    ("random_forest_regressor", best_model_random_forest_regressor),
    ("adaboost_regression", best_model_adaboost_regressor),
    ("gradient_boost_regression", best_model_gradientboost_regressor),
]

# One MLflow run per model: refit on the training split, evaluate on the
# held-out test split, then log the metric and the fitted model.
for run_name, final_model in final_models:
    with mlflow.start_run(run_name=run_name):
        final_model.fit(X_train, y_train)
        y_test_pred = final_model.predict(X_test)
        # score_function_regression returns the NEGATED RMSE (so that higher is
        # better during the search). Flip the sign back so the metric logged
        # under the name "rmse" is the actual, non-negative error.
        rmse = -score_function_regression(y_test, y_test_pred)
        mlflow.log_metric("rmse", rmse)
        mlflow.sklearn.log_model(final_model, "model")

print("models trained and logged")