# 03 • Tratamento de Campos e Padronização

Objetivo:
- converter `ÁREA TOTAL` -> `AREA_TOTAL_HA` (float)
- converter `CÓDIGO DO MUNICÍPIO (IBGE)` -> `IBGE_MUNICIPIO` (int)
- converter `PERCENTUAL DE DETENÇÃO` -> `PERCENTUAL_DETENCAO` (float)
- criar colunas `*_NORM` (sem acento, em maiúsculas) para busca.

Saída: `data/processed/imoveis_er_tratado.csv`


In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import re, unicodedata

# Project paths (adjust if needed).
# The project root is taken as the parent of the current working directory
# (i.e. .../notebooks -> project).
BASE_DIR = Path.cwd().resolve().parents[0]
RAW_DIR = BASE_DIR.joinpath("data", "raw")
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")
OUTPUTS_DIR = BASE_DIR.joinpath("outputs")

# Echo the resolved folders so they can be checked at a glance.
RAW_DIR, PROCESSED_DIR, OUTPUTS_DIR


(WindowsPath('C:/Users/User/Desktop/python/ali_rural_prospecção/data/raw'),
 WindowsPath('C:/Users/User/Desktop/python/ali_rural_prospecção/data/processed'),
 WindowsPath('C:/Users/User/Desktop/python/ali_rural_prospecção/outputs'))

In [2]:
def normalize_text(s: str) -> str:
    """Return *s* as an accent-free, upper-case, single-spaced search key.

    The value is converted with ``str()``, stripped, decomposed with NFKD,
    has its combining marks removed, is upper-cased, and every run of
    whitespace is collapsed into one space.

    Missing values (``None``, float ``NaN``, ``pd.NA``) yield ``""``.
    Without the NaN check a missing cell would be stringified and end up
    as the literal text ``"NAN"`` in the normalized column.
    """
    # is_scalar() guards pd.isna(), which returns an array for list-likes.
    if s is None or (pd.api.types.is_scalar(s) and pd.isna(s)):
        return ""
    text = unicodedata.normalize("NFKD", str(s).strip())
    # After NFKD the accents are separate combining code points; drop them.
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    return re.sub(r"\s+", " ", text.upper())

# Municipality names in the "ER" list, in their original (accented) spelling.
MUNICIPIOS_ER = [
    'Jundiaí',
    'Cabreúva',
    'Caieiras',
    'Cajamar',
    'Campo Limpo Paulista',
    'Francisco Morato',
    'Franco da Rocha',
    'Itatiba',
    'Itupeva',
    'Jarinu',
    'Louveira',
    'Morungaba',
    'Pedra Bela',
    'Pinhalzinho',
    'Tuiutí',
    'Vargem',
    'Várzea Paulista',
    'Bragança Paulista',
]
# Same names passed through normalize_text, stored as a set.
MUNICIPIOS_ER_NORM = {normalize_text(name) for name in MUNICIPIOS_ER}

# Quick sanity check: first names alphabetically and the number of distinct keys.
sorted(MUNICIPIOS_ER)[:5], len(MUNICIPIOS_ER_NORM)


(['Bragança Paulista',
  'Cabreúva',
  'Caieiras',
  'Cajamar',
  'Campo Limpo Paulista'],
 18)

In [3]:
# Input file for this notebook, located in the processed-data folder.
in_csv = PROCESSED_DIR.joinpath("imoveis_er_filtrado.csv")
# Show whether the file is present, together with its full path.
in_csv.exists(), in_csv

(True,
 WindowsPath('C:/Users/User/Desktop/python/ali_rural_prospecção/data/processed/imoveis_er_filtrado.csv'))

In [8]:
def parse_area_ha(x):
    """Convert an area value to ``float``.

    Every "." is removed and "," becomes the decimal point, so
    ``"1.234,56"`` is read as ``1234.56``.

    Returns ``np.nan`` when *x* is missing or the cleaned text is not a
    valid float.
    """
    if pd.isna(x):
        return np.nan
    # One pass over the text: delete "." and map "," to ".".
    cleaned = str(x).strip().translate({ord("."): None, ord(","): "."})
    try:
        value = float(cleaned)
    except ValueError:
        return np.nan
    return value

def parse_int(x):
    """Convert *x* to ``int``, returning ``pd.NA`` when that is not possible.

    Plain integer text (``" 3523404 "``) is parsed directly. Integer values
    that were serialized as floats (``"3523404.0"``, or a float object) are
    accepted too, instead of being silently turned into ``pd.NA``.
    Anything with a fractional part, non-numeric text, NaN and infinities
    yield ``pd.NA``.
    """
    text = str(x).strip()
    try:
        return int(text)
    except ValueError:
        pass  # not a plain integer literal; try the float form below

    try:
        as_float = float(text)
    except ValueError:
        return pd.NA
    # is_integer() is False for NaN and +/-inf, so those fall through to NA.
    if as_float.is_integer():
        return int(as_float)
    return pd.NA

def parse_float(x):
    """Convert *x* to ``float``, accepting "," as the decimal separator.

    Returns ``np.nan`` when *x* is missing or cannot be converted.
    """
    if pd.isna(x):
        return np.nan
    candidate = str(x).strip().replace(",", ".")
    try:
        number = float(candidate)
    except Exception:
        # Any conversion failure is reported as a missing value.
        return np.nan
    return number

In [5]:
# Read every column as text so the parsers decide how each value is converted,
# then append the typed and normalized columns in a single assign() call.
# NOTE(review): the source header is spelled "DENOMIÇÃO DO IMÓVEL" here;
# presumably it matches the CSV header exactly — confirm before "fixing" it.
df = pd.read_csv(in_csv, sep=";", dtype=str, low_memory=False).assign(
    # Typed columns.
    AREA_TOTAL_HA=lambda d: d["ÁREA TOTAL"].map(parse_area_ha),
    IBGE_MUNICIPIO=lambda d: d["CÓDIGO DO MUNICÍPIO (IBGE)"].map(parse_int),
    PERCENTUAL_DETENCAO=lambda d: d["PERCENTUAL DE DETENÇÃO"].map(parse_float),
    # Accent-free, upper-case search keys.
    TITULAR_NORM=lambda d: d["TITULAR"].map(normalize_text),
    DENOMINACAO_NORM=lambda d: d["DENOMIÇÃO DO IMÓVEL"].map(normalize_text),
    MUNICIPIO_NORM=lambda d: d["MUNICÍPIO"].map(normalize_text),
)

# Preview the converted columns next to their identifiers.
df.loc[
    :,
    ["CÓDIGO DO IMOVEL", "MUNICÍPIO", "AREA_TOTAL_HA", "IBGE_MUNICIPIO", "PERCENTUAL_DETENCAO"],
].head(5)

Unnamed: 0,CÓDIGO DO IMOVEL,MUNICÍPIO,AREA_TOTAL_HA,IBGE_MUNICIPIO,PERCENTUAL_DETENCAO
0,192357411,ITATIBA,4.8,3523404,100.0
1,192690692,JUNDIAÍ,3.2612,3525904,100.0
2,192692717,VARGEM,3.5,3556354,100.0
3,192693101,BRAGANÇA PAULISTA,2.9142,3507605,100.0
4,192701643,CAMPO LIMPO PAULISTA,298.9,3509601,100.0


In [6]:
# Sanity counters over the converted columns: rows with a negative area,
# rows with a percentage above 100, and rows with a missing IBGE code.
checks = dict(
    area_negativa=int(df["AREA_TOTAL_HA"].lt(0).sum()),
    percentual_maior_100=int(df["PERCENTUAL_DETENCAO"].gt(100).sum()),
    ibge_nulo=int(df["IBGE_MUNICIPIO"].isna().sum()),
)
checks

{'area_negativa': 0, 'percentual_maior_100': 2, 'ibge_nulo': 0}

In [7]:
# Write the treated table to the processed-data folder as a ";"-separated
# UTF-8 CSV, without the DataFrame index.
out_csv = PROCESSED_DIR.joinpath("imoveis_er_tratado.csv")
df.to_csv(path_or_buf=out_csv, sep=";", encoding="utf-8", index=False)
out_csv

WindowsPath('C:/Users/User/Desktop/python/ali_rural_prospecção/data/processed/imoveis_er_tratado.csv')