In [1]:
import numpy as np
import pandas as pd

RANDOM_SEED = 42

In [2]:
!wget -O train.csv "https://github.com/jelambrar96-datatalks/house-price-predictor/blob/main/dataset/train.csv"

--2025-01-06 03:46:50--  https://github.com/jelambrar96-datatalks/house-price-predictor/blob/main/dataset/train.csv
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘train.csv’

train.csv               [ <=>                ] 848.62K  --.-KB/s    in 0.06s   

2025-01-06 03:46:50 (13.9 MB/s) - ‘train.csv’ saved [868985]



In [3]:
DATASET_FILE = "../dataset/train.csv"
df_full = pd.read_csv(DATASET_FILE)

In [4]:
import re

# Function to convert camelCase or PascalCase to snake_case
def to_snake_case(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

df_full.columns = [to_snake_case(col) for col in df_full.columns]

In [5]:
df_full.drop(
    columns=["id", "alley", "pool_qc", "fence", "misc_feature", "mas_vnr_type", "fireplace_qu", "lot_frontage"],
    inplace=True
    )
df_full.dropna(inplace=True)

In [6]:
df_full.shape

(1338, 73)

In [7]:
TARGET_COLUMN = "sale_price"

df_full[TARGET_COLUMN] = np.log1p(df_full[TARGET_COLUMN])

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
df_full_train, df_test = train_test_split(df_full, test_size=0.2, random_state=RANDOM_SEED)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=RANDOM_SEED)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = (df_train[TARGET_COLUMN]).astype('int').values
y_val = (df_val[TARGET_COLUMN]).astype('int').values
y_test = (df_test[TARGET_COLUMN]).astype('int').values

del df_train[TARGET_COLUMN]
del df_val[TARGET_COLUMN]
del df_test[TARGET_COLUMN]

In [10]:
from sklearn.preprocessing import StandardScaler

numerical_cols = df_train.select_dtypes(include=['number']).columns
scaler = StandardScaler()

df_train[numerical_cols] = scaler.fit_transform(df_train[numerical_cols])
df_val[numerical_cols] = scaler.transform(df_val[numerical_cols])

In [11]:
from sklearn.feature_extraction import DictVectorizer

categorical_cols = df_train.select_dtypes(include=['object']).columns
dv = DictVectorizer(sparse=False)


In [12]:
# import RegressionExperiment and init the class
from pycaret.regression import RegressionExperiment
exp = RegressionExperiment()

In [13]:
# init setup on exp
exp.setup(df_train, target = y_train, session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Regression
3,Original data shape,"(802, 73)"
4,Transformed data shape,"(802, 241)"
5,Transformed train set shape,"(561, 241)"
6,Transformed test set shape,"(241, 241)"
7,Numeric features,35
8,Categorical features,37
9,Preprocess,True


<pycaret.regression.oop.RegressionExperiment at 0x7ee538902d10>

In [14]:
best = exp.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,0.162,0.0801,0.2796,0.7136,0.0222,0.014,0.91
lightgbm,Light Gradient Boosting Machine,0.1874,0.0844,0.2881,0.6974,0.0228,0.0162,0.668
gbr,Gradient Boosting Regressor,0.184,0.086,0.2912,0.6928,0.0231,0.0159,0.723
et,Extra Trees Regressor,0.1754,0.0916,0.301,0.6732,0.0239,0.0152,0.812
xgboost,Extreme Gradient Boosting,0.1752,0.0988,0.3107,0.6457,0.0247,0.0151,0.743
knn,K Neighbors Regressor,0.1523,0.1001,0.3139,0.6435,0.0248,0.013,0.453
br,Bayesian Ridge,0.2333,0.1019,0.3152,0.6362,0.025,0.0202,0.447
omp,Orthogonal Matching Pursuit,0.2467,0.1093,0.3278,0.6098,0.026,0.0214,0.414
ada,AdaBoost Regressor,0.2754,0.1191,0.3435,0.5744,0.0273,0.0239,0.521
ridge,Ridge Regression,0.2563,0.1366,0.3623,0.5138,0.0288,0.0223,0.441


## auxiliar functions

In [15]:
from sklearn.pipeline import Pipeline

def evaluate_model(model, X_train, y_train, X_val, y_val, params, score_function):
    """
    # Función para evaluar un conjunto de hiperparámetros
    """
    pipeline = Pipeline([
        ('logistic', model(**params))
    ])    
    pipeline.fit(X_train, y_train)
    y_val_pred = pipeline.predict(X_val)
    accuracy = score_function(y_val, y_val_pred)
    return accuracy, pipeline

In [16]:
from itertools import product

def find_best_model(
        Model,
        parameter_grid,
        X_train,
        y_train,
        X_val,
        y_val,
        score_function,
        verbose=False):
    best_accuracy = -np.inf
    best_params = None
    best_model = None

    parameter_labels = parameter_grid.keys()
    parameter_values = parameter_grid.values()

    for temp_parameter_iterable in product(*parameter_values):
        params = { label:value for label, value in zip(parameter_labels, temp_parameter_iterable) }
        if verbose:
            print()
            print(params)

        # Evaluamos los parámetros
        try:
            accuracy, model = evaluate_model(
                Model, X_train, y_train, X_val, y_val, params, score_function
            )
        except ValueError as ve:
            if verbose:
                print(ve)
            continue

        if verbose:
            print(f"accuracy: {accuracy}")
        
        # Actualizamos mejor modelo si es necesario
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
            best_model = model

    return best_model, best_params, best_accuracy

In [17]:
from sklearn.metrics import root_mean_squared_error


def score_function_regression(y_true, y_pred):
    return -1 * root_mean_squared_error(y_true, y_pred)

In [18]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

In [34]:
# model 1 lienar regression
from mlmodels.skmodels import ZCLinearRegression


parameter_grid_linear_regression = {
    'fit_intercept': [True, False],
    'normalize': [True, False],
    'copy_X': [True, False],
    'max_iter': [100, 500, 1000],
    'tol': [1e-3, 1e-4, 1e-5],
    'positive': [True, False],
    'n_jobs': [-1, 1, 2, 4]
}


best_model_linear_regression, best_params_linear_regression, best_mrse_linear_regression = find_best_model(
    ZCLinearRegression,
    parameter_grid_linear_regression,
    X_train,
    y_train,
    X_val,
    y_val,
    score_function_regression,
    verbose=True
)


ImportError: cannot import name 'BasicModel' from 'mlmodels.basicmodel' (/workspaces/house-price-predictor/train/mlmodels/basicmodel.py)