In [43]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [44]:
import pandas as pd

path = r"G:\My Drive\Github\py-2025-epge-dados1-finalassignment\data\input\resultados_merged.parquet"

df = pd.read_parquet(path)
df.shape

(27928, 56)

In [45]:
# Passo 1 — Definir o target (y)

y = np.log(df["valor_oferta"])


In [46]:
#2.1 Criar variáveis derivadas mínimas

# 2.1 Criar variáveis derivadas mínimas

df["bairro_std"] = df["bairro"].astype(str).str.upper().str.strip()
df.loc[df["bairro"].isna(), "bairro_std"] = "MISSING"

df["uf_cidade"] = df["uf.x"].astype(str) + "_" + df["cidade.x"].astype(str)
df["uf_cidade_bairro"] = df["uf_cidade"] + "_" + df["bairro_std"]

# filtrar bairros raros (ex: >= 30)
counts = df["uf_cidade_bairro"].value_counts()
valid = counts[counts >= 30].index

df["uf_cidade_bairro_f"] = np.where(df["uf_cidade_bairro"].isin(valid),
                                    df["uf_cidade_bairro"],
                                    "OUTROS_BAIRROS")

# Converter data_licitacao para datetime (tenta ISO e depois dia/mês/ano)
d = pd.to_datetime(df["data_licitacao"], errors="coerce")
if d.isna().mean() > 0.5:
    d = pd.to_datetime(df["data_licitacao"], errors="coerce", dayfirst=True)

df["data_licitacao_dt"] = d
df["ano"] = df["data_licitacao_dt"].dt.year
df["mes"] = df["data_licitacao_dt"].dt.month

count_cols = [ "area_total", "area_privativa", "area_terreno", "quartos", "salas", "vagas_garagem", 
              "lavabos", "suites", "cozinhas", "varandas", "sacadas", "terracos", "areas_servico", 
              "dce", "churrasqueiras", "wc", "wc_emp"
]

for c in count_cols:
    df[f"{c}_missing"] = df[c].isna().astype(int)

# 2.2 Definir features (baseline)
FEATURES = [
    "uf_cidade",                 # <- agora entra completo
    "uf_cidade_bairro_f",
    "modalidade_de_venda.x",
    "tipo",
    "ano", "mes",
    #"preco", "valor_de_avaliacao",

    "area_total", "area_total_missing",
    "area_privativa", "area_privativa_missing",
    "area_terreno", "area_terreno_missing",

    "quartos", "quartos_missing",
    "salas", "salas_missing",
    "vagas_garagem", "vagas_garagem_missing",
    "lavabos", "lavabos_missing",
    "suites", "suites_missing",

    "cozinhas", "cozinhas_missing",
    "varandas", "varandas_missing",
    "sacadas", "sacadas_missing",
    "terracos", "terracos_missing",
    "areas_servico", "areas_servico_missing",
    "dce", "dce_missing",
    "churrasqueiras", "churrasqueiras_missing",
    "wc", "wc_missing",
    "wc_emp", "wc_emp_missing",
]

X = df[FEATURES].copy()
X.shape




(27928, 40)

In [47]:
is_test = (df["ano"] == 2024)

X_train = X.loc[~is_test].copy()
y_train = y.loc[~is_test].copy()

X_test  = X.loc[is_test].copy()
y_test  = y.loc[is_test].copy()

X_train.shape, X_test.shape



((18654, 40), (9274, 40))

In [48]:
from catboost import CatBoostRegressor

cat_cols = ["uf_cidade", "uf_cidade_bairro_f", "modalidade_de_venda.x", "tipo"]

cat_idx = [X_train.columns.get_loc(c) for c in cat_cols]

model = CatBoostRegressor(
    loss_function="RMSE",
    iterations=2000,
    learning_rate=0.05,
    depth=8,
    random_seed=42,
    verbose=200
)

model.fit(
    X_train, y_train,
    cat_features=cat_idx,
    eval_set=(X_test, y_test),
    use_best_model=True
)



0:	learn: 0.6168616	test: 0.5835007	best: 0.5835007 (0)	total: 102ms	remaining: 3m 23s
200:	learn: 0.3544583	test: 0.4525545	best: 0.4001539 (55)	total: 21.4s	remaining: 3m 11s
400:	learn: 0.3349879	test: 0.4707737	best: 0.4001539 (55)	total: 43.2s	remaining: 2m 52s
600:	learn: 0.3206717	test: 0.4894155	best: 0.4001539 (55)	total: 1m 4s	remaining: 2m 30s
800:	learn: 0.3090864	test: 0.4990213	best: 0.4001539 (55)	total: 1m 26s	remaining: 2m 8s
1000:	learn: 0.2977716	test: 0.4994905	best: 0.4001539 (55)	total: 1m 47s	remaining: 1m 47s
1200:	learn: 0.2889430	test: 0.5038709	best: 0.4001539 (55)	total: 2m 9s	remaining: 1m 26s
1400:	learn: 0.2805271	test: 0.5046616	best: 0.4001539 (55)	total: 2m 31s	remaining: 1m 4s
1600:	learn: 0.2729302	test: 0.5091919	best: 0.4001539 (55)	total: 2m 52s	remaining: 43.1s
1800:	learn: 0.2667624	test: 0.5106062	best: 0.4001539 (55)	total: 3m 14s	remaining: 21.5s
1999:	learn: 0.2614396	test: 0.5118756	best: 0.4001539 (55)	total: 3m 36s	remaining: 0us

bestTes

<catboost.core.CatBoostRegressor at 0x1f71b7b9c70>

In [49]:
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

rmse, mae, r2


(np.float64(0.40015388710955374), 0.2740266496081811, 0.5411970741776237)

In [50]:
fi = pd.DataFrame({
    "feature": X_train.columns,
    "importance": model.get_feature_importance()
}).sort_values("importance", ascending=False)

fi.head(20)


Unnamed: 0,feature,importance
8,area_privativa,27.151696
0,uf_cidade,26.71319
2,modalidade_de_venda.x,11.433197
3,tipo,9.635048
10,area_terreno,8.065801
16,vagas_garagem,2.731309
12,quartos,2.477243
1,uf_cidade_bairro_f,2.476851
36,wc,2.411263
5,mes,1.653985
