In [14]:
import pandas as pd
import pickle
import pathlib
import numpy as np

# Loading the data

In [15]:
# Base directory for the data and path to the preprocessed dataset
DATA_DIR = pathlib.Path.cwd() / 'data'
clean_data_path = DATA_DIR / 'ames_clean.pkl'

# Load the preprocessed dataset from a pickle file.
# The dataset is expected to have been cleaned and prepared beforehand.
# SECURITY: unpickling can execute arbitrary code, so only load a file that
# was produced by a trusted step of this project.
with open(clean_data_path, 'rb') as file:
    data = pickle.load(file)

# Work on a copy so the loaded dataset is never modified
model_data = data.copy()

# Split the categorical columns into ordinal (ordered categories) and
# nominal (unordered categories)
categorical_columns = []
ordinal_columns = []
for col in model_data.select_dtypes('category').columns:
    if model_data[col].cat.ordered:
        ordinal_columns.append(col)
    else:
        categorical_columns.append(col)

# Encode ordinal columns as integers that follow the declared category order.
# Example: categories ["low", "medium", "high"] -> codes [0, 1, 2]
# `.cat.codes` maps every value to its position in the declared categories,
# so the encoding does not shift when a level happens to be absent from the
# data (pd.factorize renumbers only the levels it observes). Missing values
# are encoded as -1.
for col in ordinal_columns:
    model_data[col] = model_data[col].cat.codes

# One-hot encode the remaining (nominal) categorical columns.
# drop_first=True drops the first level of each column, which reduces
# multicollinearity; useful for linear models.
model_data = pd.get_dummies(model_data, drop_first=True)

# Splitting the data into train and test

In [22]:
from sklearn.model_selection import train_test_split

# Seed that makes the train/test partition reproducible
RANDOM_SEED = 42

# Separate the target column from the predictor columns
y = model_data['SalePrice'].copy()
X = model_data.drop(columns='SalePrice').copy()

# Hold out 20% of the rows as the test set
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# Training Different Models

### Função que calcula o RMSE e o erro médio

In [23]:
from sklearn.metrics import mean_squared_error

def print_errors(y_true, y_pred):
    """Print the RMSE of a set of predictions and the matching percentage error.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.

    Returns:
        None. Two lines are written to standard output.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    # NOTE(review): 10 ** rmse only reads as a relative error if the target is
    # a base-10 logarithm (e.g. log10 of the price) - confirm against the
    # data preparation step.
    error_percent = (10 ** rmse - 1) * 100

    print(f'RMSE: {rmse:.4f}')
    print(f'Average error: {error_percent:.2f}%')

### Regressão Linear

In [43]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least squares baseline on the training split
# (fit() returns the estimator itself, so construction and fitting chain)
model = LinearRegression().fit(Xtrain, ytrain)

# Evaluate the baseline on the held-out split
ypred = model.predict(Xtest)
print_errors(ytest, ypred)

RMSE: 0.0583
Average error: 14.37%


### Random Forest Regressor

In [44]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator; seeded with the notebook-wide RANDOM_SEED (defined with the
# train/test split) instead of a repeated literal, so one constant controls
# every source of randomness.
rf_model = RandomForestRegressor(random_state=RANDOM_SEED)

# Hyperparameter grid: 3 * 2 * 2 * 2 * 1 = 24 combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4],
    'bootstrap': [True],
}

# Exhaustive search with 5-fold cross-validation on the training split.
# Scoring is the negated MSE, so higher is better; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

grid_search.fit(Xtrain, ytrain)

# With the default refit=True, best_estimator_ is the best configuration
# retrained on the whole training split.
best_rf_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out split
y_pred = best_rf_model.predict(Xtest)

print(f"Best Hyperparameters: {grid_search.best_params_}")
print_errors(ytest, y_pred)

Best Hyperparameters: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
RMSE: 0.0565
Average error: 13.90%


### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator; seeded with the notebook-wide RANDOM_SEED (defined with the
# train/test split) instead of a repeated literal.
dt_model = DecisionTreeRegressor(random_state=RANDOM_SEED)

# Hyperparameter grid: 3 * 3 * 3 = 27 combinations
param_grid = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive search with 5-fold cross-validation on the training split.
# Scoring is the negated MSE, so higher is better; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

grid_search.fit(Xtrain, ytrain)

# With the default refit=True, best_estimator_ is the best configuration
# retrained on the whole training split.
best_dt_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out split
y_pred = best_dt_model.predict(Xtest)

print(f"Best Hyperparameters: {grid_search.best_params_}")
print_errors(ytest, y_pred)

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
RMSE: 0.0763
Average error: 19.20%


### Ridge Regression 

In [50]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Base estimator; seeded with the notebook-wide RANDOM_SEED (defined with the
# train/test split) instead of a repeated literal.
# NOTE(review): per the scikit-learn docs the seed only matters for the
# 'sag'/'saga' solvers, so it is presumably inert with the default solver.
ridge_model = Ridge(random_state=RANDOM_SEED)

# Regularization strengths to try, spanning several orders of magnitude
param_grid = {
    'alpha': [0.1, 1, 10, 100, 1000],
}

# Exhaustive search with 5-fold cross-validation on the training split.
# Scoring is the negated MSE, so higher is better; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(
    estimator=ridge_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

grid_search.fit(Xtrain, ytrain)

# With the default refit=True, best_estimator_ is the best configuration
# retrained on the whole training split.
best_ridge_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out split
y_pred = best_ridge_model.predict(Xtest)

print(f"Best Hyperparameters: {grid_search.best_params_}")
print_errors(ytest, y_pred)


Best Hyperparameters: {'alpha': 1}
RMSE: 0.0582
Average error: 14.33%


### XGBoost Regressor

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Base estimator; seeded with the notebook-wide RANDOM_SEED (defined with the
# train/test split) instead of a repeated literal.
xgb_model = XGBRegressor(random_state=RANDOM_SEED)

# Hyperparameter grid: 3 * 2 * 3 * 3 * 3 = 162 combinations, i.e. 810 fits
# with 5-fold cross-validation - by far the most expensive search here.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
}

# Exhaustive search with 5-fold cross-validation on the training split.
# Scoring is the negated MSE, so higher is better; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

grid_search.fit(Xtrain, ytrain)

# With the default refit=True, best_estimator_ is the best configuration
# retrained on the whole training split.
best_xgb_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out split
y_pred = best_xgb_model.predict(Xtest)

print(f"Best Hyperparameters: {grid_search.best_params_}")
print_errors(ytest, y_pred)

### AdaBoost Regressor

In [54]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator; seeded with the notebook-wide RANDOM_SEED (defined with the
# train/test split) instead of a repeated literal.
ada_model = AdaBoostRegressor(random_state=RANDOM_SEED)

# Hyperparameter grid: 3 * 3 = 9 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
}

# Exhaustive search with 5-fold cross-validation on the training split.
# Scoring is the negated MSE, so higher is better; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(
    estimator=ada_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

grid_search.fit(Xtrain, ytrain)

# With the default refit=True, best_estimator_ is the best configuration
# retrained on the whole training split.
best_ada_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out split
y_pred = best_ada_model.predict(Xtest)

print(f"Best Hyperparameters: {grid_search.best_params_}")
print_errors(ytest, y_pred)

Best Hyperparameters: {'learning_rate': 0.1, 'n_estimators': 200}
RMSE: 0.0738
Average error: 18.53%


# Comparing the models

# Deciding the best model