In [126]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings: show up to 200 columns and render floats
# with thousands separators and two decimals.
pd.options.display.max_columns = 200
pd.options.display.float_format = "{:,.2f}".format



In [127]:
import os

import pandas as pd

# Location of the merged results parquet file.
# The default is the original author's local path; set the RESULTADOS_PARQUET
# environment variable to run the notebook on another machine without editing
# this cell.
path = os.environ.get(
    "RESULTADOS_PARQUET",
    r"G:\My Drive\Github\py-2025-epge-dados1-finalassignment\data\input\resultados_merged.parquet",
)

df = pd.read_parquet(path)
df.shape



(27928, 56)

In [128]:
# Step 1 — define the target (y): natural log of the offer value.
import numpy as np

y = df["valor_oferta"].pipe(np.log)


In [129]:
# 2.1 Create the minimal set of derived variables

# Combined "<state>_<city>" key built from the string form of both columns.
state_txt = df["uf.x"].astype(str)
city_txt = df["cidade.x"].astype(str)
df["uf_cidade"] = state_txt + "_" + city_txt

# Parse data_licitacao as datetime: try the default parser first and, if more
# than half of the values fail to parse, retry assuming day-first ordering.
raw_dates = df["data_licitacao"]
d = pd.to_datetime(raw_dates, errors="coerce")
mostly_unparsed = d.isna().mean() > 0.5
if mostly_unparsed:
    d = pd.to_datetime(raw_dates, errors="coerce", dayfirst=True)

df["data_licitacao_dt"] = d
df["ano"] = d.dt.year
df["mes"] = d.dt.month

# Columns that receive a companion "<name>_missing" 0/1 indicator.
count_cols = [
    "area_total",
    "area_privativa",
    "area_terreno",
    "quartos",
    "salas",
    "vagas_garagem",
    "lavabos",
    "suites",
    "cozinhas",
    "varandas",
    "sacadas",
    "terracos",
    "areas_servico",
    "dce",
    "churrasqueiras",
    "wc",
    "wc_emp",
]

for col in count_cols:
    flag_name = col + "_missing"
    df[flag_name] = df[col].isna().astype(int)

# 2.2 Define the baseline feature set
FEATURES = [
    # categorical
    "uf.x",
    "modalidade_de_venda.x",
    "tipo",
    # time
    "ano",
    "mes",
    # prices
    "preco",
    "valor_de_avaliacao",
] + [
    # areas, main counts and rooms/amenities: each raw column is followed
    # immediately by its "<name>_missing" indicator, in count_cols order
    name
    for base in count_cols
    for name in (base, f"{base}_missing")
]

X = df.loc[:, FEATURES]
X.shape




(27928, 41)

In [130]:
# Temporal hold-out: rows whose auction year equals TEST_YEAR form the test
# set; every other row is used for training.
# NOTE(review): rows with a missing year (unparseable date) compare False here
# and therefore land in the training set — confirm this is intended.
TEST_YEAR = 2024
is_test = df["ano"] == TEST_YEAR

X_train = X.loc[~is_test].copy()
y_train = y.loc[~is_test].copy()

X_test = X.loc[is_test].copy()
y_test = y.loc[is_test].copy()

# The two subsets must partition the full feature matrix.
assert len(X_train) + len(X_test) == len(X)

# Impute missing values with medians computed on the training rows only, and
# reuse those same medians for the test rows so no test information is used.
medians = X_train[count_cols].median()
X_train[count_cols] = X_train[count_cols].fillna(medians)
X_test[count_cols] = X_test[count_cols].fillna(medians)

# Kept as the last expression so the notebook actually displays the shapes.
X_train.shape, X_test.shape


In [131]:
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


In [132]:
# Columns that are one-hot encoded.
cat_cols = ["uf.x", "modalidade_de_venda.x", "tipo"]

# Columns passed through as-is: time and price columns first, then every
# area/count column immediately followed by its "<name>_missing" indicator.
num_cols = ["ano", "mes", "preco", "valor_de_avaliacao"] + [
    name
    for base in (
        "area_total",
        "area_privativa",
        "area_terreno",
        "quartos",
        "salas",
        "vagas_garagem",
        "lavabos",
        "suites",
        "cozinhas",
        "varandas",
        "sacadas",
        "terracos",
        "areas_servico",
        "dce",
        "churrasqueiras",
        "wc",
        "wc_emp",
    )
    for name in (base, f"{base}_missing")
]


In [133]:
# Baseline model: ordinary least squares on top of a column-wise preprocessor.
model = LinearRegression()

# Categorical columns are one-hot encoded (categories unseen during fit are
# ignored at predict time); numeric columns are passed through unchanged.
preprocess = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# Chain preprocessing and regression so both are fit and applied together.
pipe = Pipeline([("prep", preprocess), ("model", model)])


In [134]:
# Fit preprocessing + regression on the training rows; the fitted pipeline is
# returned and displayed as the cell output.
pipe.fit(X=X_train, y=y_train)


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [135]:
# Evaluate on the held-out rows. Because y is the log of valor_oferta, all
# three metrics are on the log scale.
y_pred = pipe.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# NOTE(review): a negative R² means the model does worse on the test rows than
# predicting their mean; an RMSE far above the MAE suggests a few very large
# errors dominate — TODO investigate before relying on this baseline.
rmse, mae, r2


(np.float64(1.1656043001221121), 0.2715104583382328, -2.892910154809529)