In [38]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin


# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

# Funciones

In [42]:
class FrequencySamplerImputer(BaseEstimator, TransformerMixin):
    """Impute the missing values of ONE column by sampling its observed frequencies.

    ``fit`` records the distinct non-missing values of the column and their
    relative frequencies. ``transform`` replaces every missing entry with a
    random draw from that empirical distribution, so the imputed column keeps
    roughly the same category proportions as the observed data.

    Parameters
    ----------
    random_state : int, default=42
        Seed of the generator that ``transform`` creates on every call.
        Because the generator is re-created each time, transforming the same
        data twice yields exactly the same imputed values.

    Attributes
    ----------
    categories_ : ndarray
        Distinct observed values, in the order returned by ``np.unique``.
    probs_ : ndarray
        Relative frequency of each entry of ``categories_``.
    n_features_in_ : int
        Always 1; the imputer works on a single column.
    feature_names_in_ : ndarray of object
        Column name seen during ``fit``. Only set when ``X`` carries one
        (DataFrame or named Series).
    """

    def __init__(self, random_state=42):
        self.random_state = random_state

    @staticmethod
    def _as_1d(X):
        """Return a flat copy of ``X``, rejecting input with more than one column."""
        # np.array always copies, so the caller's data is never modified.
        values = np.array(X)
        if values.ndim >= 2 and values.size != values.shape[0]:
            # Flattening several columns would pool their values together and
            # change the number of rows of the output.
            raise ValueError(
                "FrequencySamplerImputer espera una sola columna; "
                f"se recibió un arreglo de forma {values.shape}."
            )
        return values.ravel()

    @staticmethod
    def _column_names(X):
        """Return the column name carried by ``X`` as a 1-element array, or None."""
        columns = getattr(X, "columns", None)
        if columns is not None:
            return np.asarray(columns, dtype=object)
        if isinstance(X, pd.Series) and X.name is not None:
            return np.asarray([X.name], dtype=object)
        return None

    def fit(self, X, y=None):
        """Learn the observed values of the column and their frequencies.

        Parameters
        ----------
        X : array-like of shape (n_samples,) or (n_samples, 1)
            Column to learn from; missing entries are detected with ``pd.isna``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` has more than one column or no observed (non-missing) value.
        """
        x = self._as_1d(X)
        observed = x[~pd.isna(x)]

        if observed.size == 0:
            raise ValueError("No hay categorías observadas para imputar.")

        vals, counts = np.unique(observed, return_counts=True)
        self.categories_ = vals
        self.probs_ = counts / counts.sum()

        self.n_features_in_ = 1
        names = self._column_names(X)
        if names is not None:
            self.feature_names_in_ = names
        elif hasattr(self, "feature_names_in_"):
            # Re-fitting on unnamed data must not keep a stale column name.
            del self.feature_names_in_
        return self

    def transform(self, X):
        """Fill missing entries with draws from the fitted frequency distribution.

        Parameters
        ----------
        X : array-like of shape (n_samples,) or (n_samples, 1)
            Column to impute. It is copied, never modified in place.

        Returns
        -------
        ndarray of shape (n_samples, 1)
            The column with every missing entry replaced.

        Raises
        ------
        ValueError
            If ``X`` has more than one column.
        """
        x = self._as_1d(X)
        mask = pd.isna(x)
        n_missing = int(mask.sum())

        if n_missing > 0:
            # A fresh generator seeded with random_state keeps every call reproducible.
            rng = np.random.default_rng(self.random_state)
            x[mask] = rng.choice(self.categories_, size=n_missing, replace=True, p=self.probs_)

        return x.reshape(-1, 1)

    def get_feature_names_out(self, input_features=None):
        """Return the output column name (the imputer keeps its single input column).

        Needed so that a Pipeline / ColumnTransformer containing this step can
        report its output feature names.

        Parameters
        ----------
        input_features : array-like of str of length 1, optional
            Name of the input column. Defaults to the name seen during ``fit``,
            or ``"x0"`` when the fitted data carried no name.

        Returns
        -------
        ndarray of object, shape (1,)

        Raises
        ------
        ValueError
            If ``input_features`` does not contain exactly one name.
        """
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", ["x0"])
        names = np.asarray(input_features, dtype=object)
        if names.shape != (1,):
            raise ValueError(
                "FrequencySamplerImputer espera exactamente un nombre de columna; "
                f"se recibió {input_features!r}."
            )
        return names

# Columnas descartadas
A partir del EDA, se listan las variables descartadas junto con la razón de cada descarte:

### Target leakage
- SHORT_DESC: Leakage -> Relación 1-1 con CLASS

### IDs
- P_ID: Identificador único
- TAX_MAP: Identificador/código casi único

### Redundancia
- SHORT_DESC 1: Redundante con LEVY_CODE_1

### Dirección exacta / alta cardinalidad / PII (riesgo + no generaliza)
- FORMATED_ADDRESS: Dirección completa (alta cardinalidad, PII).
- STREET: Nombre de calle (alta cardinalidad, PII).
- SUFFIX: Parte de dirección; redundante con STREET (Cramér’s V alto) y poca utilidad.
- CIVIC: Número de dirección; sensible y ligado a la dirección exacta (riesgo de que el modelo lo use como atajo).
- FREE_LINE_2: Texto libre de mailing (alta cardinalidad, PII).
- FIRST_NAME: PII (nombre).
- LAST_NAME: PII (apellido).
- COMPANY: PII/alta cardinalidad

### Mailing address
- CIVIC 1
- STREET 1
- S_SUFFIX
- CITY 1
- STATE
- ZIP_POSTAL 1

### Unidades
- unit: Alta cardinalidad
- UNIT: Redundante/relacionada con unit

### Geografía en formato string / derivaciones no usadas
- Property_Location: Se usa solo para extraer lat/lon
- geo_cell: Demasiado granular y poco aporte



# Columnas seleccionadas

In [5]:
# Name of the column to predict.
target = "CLASS"

# Numeric predictors.
num_features_final = [
    "TOTAL_ASSMT",
    "TOTAL_TAXES",
    "TOTAL_EXEMPT",
    "lat",
    "lon",
]

# Categorical predictors that share the generic categorical pipeline.
cat_features_final = [
    "LEVY_CODE_1",
    "geo_cluster",
]

# ZIP_POSTAL is kept in its own list because the preprocessing cell gives it a
# dedicated pipeline. Defining it here keeps `cat_features_final` identical to
# the value used there, instead of the same name holding two different lists.
zip_feature = ["ZIP_POSTAL"]

# Same columns, in the same order, as before: numeric, categorical, ZIP.
features_final = num_features_final + cat_features_final + zip_feature

In [22]:
DATA_PATH = "../data/2024_Property_Tax_Roll.csv"
df_ = pd.read_csv(DATA_PATH)

# Property_Location is parsed as "POINT (<first number> <second number>)":
# the first number is stored as lon and the second as lat. Rows that do not
# match the pattern end up with NaN coordinates.
coords = df_["Property_Location"].str.extract(r"POINT\s*\(\s*([-\d\.]+)\s+([-\d\.]+)\s*\)")
df_["lon"] = pd.to_numeric(coords[0], errors="coerce")
df_["lat"] = pd.to_numeric(coords[1], errors="coerce")

# Cluster only the rows that have both coordinates.
# NOTE(review): KMeans is fitted on every row, before the train/test split, so
# the cluster centres also see the future test rows — confirm this is acceptable.
valid = df_[["lat", "lon"]].dropna()
kmeans = KMeans(n_clusters=10, random_state=42)
cluster_labels = pd.Series(kmeans.fit_predict(valid), index=valid.index).astype(str)

# Rows without coordinates keep a real missing value. Casting the whole column
# with .astype("Int64").astype(str) would turn them into the literal string
# "<NA>", which isna() and the imputers no longer recognise as missing. Building
# the column from scratch also makes this cell safe to re-run.
df_["geo_cluster"] = cluster_labels.reindex(df_.index)


In [23]:
# Keep only the rows whose target is known; work on an independent copy.
df_model = df_.loc[df_[target].notna()].copy()

# Predictor matrix and label vector (labels as integer-looking strings).
X = df_model.loc[:, features_final].copy()
y = df_model.loc[:, target].astype("Int64").astype(str)

(X.shape, y.shape)

((44033, 8), (44033,))

In [24]:
# Number of missing values in each selected feature, largest count first.
(
    df_model.loc[:, features_final]
    .isna()
    .sum()
    .sort_values(ascending=False)
)

ZIP_POSTAL      1436
TOTAL_ASSMT        0
TOTAL_EXEMPT       0
TOTAL_TAXES        0
lat                0
lon                0
LEVY_CODE_1        0
geo_cluster        0
dtype: int64

In [None]:
# Percentage of missing values in each feature, highest first.
(X.isna().mean() * 100).sort_values(ascending=False)


ZIP_POSTAL      3.26119
TOTAL_ASSMT     0.00000
TOTAL_EXEMPT    0.00000
TOTAL_TAXES     0.00000
lat             0.00000
lon             0.00000
LEVY_CODE_1     0.00000
geo_cluster     0.00000
dtype: float64

# Separación train_test

In [None]:
# Integer version of the labels, used both as the target and for stratification.
y_int = y.astype(int)

# 80/20 split that preserves the class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_int,
    test_size=0.2,
    random_state=42,
    stratify=y_int,
)

tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((35226, 8), (8807, 8), (35226,), (8807,))

In [35]:
# Distribución de clases (proporciones) para confirmar estratificación
dist_train = y_train.value_counts(normalize=True).sort_index()
dist_test  = y_test.value_counts(normalize=True).sort_index()

dist_train.head(), dist_test.head()


(CLASS
 1    0.334412
 2    0.317691
 3    0.011696
 4    0.016408
 5    0.000596
 Name: proportion, dtype: float64,
 CLASS
 1    0.334393
 2    0.317702
 3    0.011695
 4    0.016464
 5    0.000568
 Name: proportion, dtype: float64)

## Preprocesamiento

In [40]:
# Column groups; each group is routed to its own preprocessing pipeline.
num_features_final = [
    "TOTAL_ASSMT",
    "TOTAL_TAXES",
    "TOTAL_EXEMPT",
    "lat",
    "lon",
]
cat_features_final = [
    "LEVY_CODE_1",
    "geo_cluster",
]
zip_feature = ["ZIP_POSTAL"]

# Numeric columns: median imputation only.
numeric_pipe = Pipeline(steps=[("imp", SimpleImputer(strategy="median"))])

# Generic categorical columns: most-frequent imputation, then one-hot encoding.
cat_pipe = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# ZIP code: frequency-sampled imputation, then one-hot encoding.
zip_pipe = Pipeline(
    steps=[
        ("imp_freq", FrequencySamplerImputer(random_state=42)),
        ("oh", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Any column not listed in one of the three groups is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_features_final),
        ("cat", cat_pipe, cat_features_final),
        ("zip", zip_pipe, zip_feature),
    ],
    remainder="drop",
)


In [41]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformers to the test split.
Xtr = preprocess.fit_transform(X_train, y_train)
Xte = preprocess.transform(X_test)

# Display the shapes of both transformed matrices.
Xtr.shape, Xte.shape


((35226, 99), (8807, 99))