In [1]:
!pip install category_encoders scikit-learn pandas



In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def load_data(train_path: str, test_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Reads the training and test CSV files into DataFrames.

    Args:
        train_path: Path of the CSV file holding the training split.
        test_path: Path of the CSV file holding the test split.

    Returns:
        A ``(train, test)`` tuple of DataFrames, in that order.
    """
    train_frame, test_frame = (
        pd.read_csv(csv_path) for csv_path in (train_path, test_path)
    )
    return train_frame, test_frame

# Read both splits from CSV files located in the working directory.
train, test = load_data(train_path="train.csv", test_path="test.csv")

In [4]:
def plot_data_statistics(df: pd.DataFrame, numerical_cols: list[str]) -> None:
    """
    Plots a histogram per numerical feature and displays their summary statistics.

    Args:
        df: Frame that contains the columns to inspect.
        numerical_cols: Names of the columns to plot and describe.

    Returns:
        None. The figure and the transposed ``describe()`` table are rendered
        as cell output.
    """
    numeric_view = df[numerical_cols]

    # One histogram panel per selected column, drawn on a shared figure.
    numeric_view.hist(bins=30, figsize=(15, 10), edgecolor='black')
    plt.suptitle("Distribution of Numerical Features")
    plt.tight_layout()
    plt.show()

    # Transposed so that each feature is a row and each statistic a column.
    summary = numeric_view.describe().T
    display(summary)


In [None]:
# Columns inspected during exploration; the target `price` is included here.
numerical_columns = [
    'price',
    'net_area',
    'net_usable_area',
    'n_rooms',
    'n_bathroom',
    'latitude',
    'longitude',
]
plot_data_statistics(df=train, numerical_cols=numerical_columns)

In [6]:
def detect_outliers_iqr(df: pd.DataFrame, columns: list[str]) -> dict[str, pd.DataFrame]:
    """
    Flags outliers column by column with the 1.5 * IQR rule.

    A row is an outlier for a column when its value lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``. The number of
    flagged rows is printed for every column.

    Args:
        df: Frame to inspect.
        columns: Names of the columns to check.

    Returns:
        Mapping from column name to the rows of ``df`` flagged for that column.
    """
    flagged_by_column: dict[str, pd.DataFrame] = {}

    for col in columns:
        values = df[col]
        first_quartile, third_quartile = (values.quantile(q) for q in (0.25, 0.75))
        spread = third_quartile - first_quartile

        too_low = values < first_quartile - 1.5 * spread
        too_high = values > third_quartile + 1.5 * spread

        flagged = df[too_low | too_high]
        flagged_by_column[col] = flagged

        print(f"{col}: {len(flagged)} outliers")

    return flagged_by_column


In [None]:
# Outlier rows per inspected column, keyed by column name.
iqr_outliers = detect_outliers_iqr(df=train, columns=numerical_columns)

In [None]:
# Ten cheapest listings among the rows flagged as price outliers.
iqr_outliers["price"].sort_values(by="price").head(n=10)

Filtering rules inferred from statistical analysis and outlier detection:

In [9]:
def clean_data(df: pd.DataFrame, price_cap: "float | None" = None) -> pd.DataFrame:
    """
    Removes duplicate rows and rows that violate the filtering rules.

    A row is kept only if all of the following hold (a missing value in a
    checked column fails the comparison, so such rows are dropped):
      - 10 < net_area < 10000 and 10 < net_usable_area < 10000
      - 0 < n_rooms <= 10 and 0 <= n_bathroom <= 10
      - 1000 <= price <= price cap
      - latitude in [-90, 90] and longitude in [-180, 180], neither equal to 0

    Args:
        df: Frame with the columns ``net_area``, ``net_usable_area``,
            ``n_rooms``, ``n_bathroom``, ``price``, ``latitude`` and
            ``longitude``. It is not modified.
        price_cap: Upper bound for ``price``. When None (the default) the cap
            is the 99th percentile of the prices that survive the preceding
            filters, i.e. it is derived from ``df`` itself. Pass an explicit
            value (for example the cap computed on the training split) to
            filter another split with the same threshold.

    Returns:
        A new, independent DataFrame holding the surviving rows with their
        original index labels.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = df.drop_duplicates()

    # These checks are all row-wise, so they can be combined into one mask.
    plausible = (
        (df["net_area"] > 10) & (df["net_area"] < 10000)
        & (df["net_usable_area"] > 10) & (df["net_usable_area"] < 10000)
        & (df["n_rooms"] > 0) & (df["n_rooms"] <= 10)
        & (df["n_bathroom"] >= 0) & (df["n_bathroom"] <= 10)
        & (df["price"] >= 1000)
    )
    df = df[plausible]

    # The default cap must be computed after the filters above and before the
    # coordinate filter, because the percentile depends on which rows remain.
    if price_cap is None:
        price_cap = df["price"].quantile(0.99)
    df = df[df["price"] <= price_cap]

    # A coordinate of exactly 0 is rejected in addition to out-of-range values.
    located = (
        df["latitude"].between(-90, 90) & (df["latitude"] != 0)
        & df["longitude"].between(-180, 180) & (df["longitude"] != 0)
    )

    # Explicit copy so callers can modify the result without touching `df`.
    return df[located].copy()

# Apply the same cleaning rules to both splits.
# NOTE: this overwrites `train` and `test`, and clean_data derives its price
# cap from the frame it receives, so re-running this cell can drop further rows.
train, test = (clean_data(split) for split in (train, test))

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from category_encoders import TargetEncoder

# Column roles: `price` is the regression target, `type` and `sector` are
# target-encoded, and every other column of `train` is passed through as-is.
target = "price"
categorical_cols = ["type", "sector"]
numerical_cols = [col for col in train.columns if col not in categorical_cols + [target]]
train_cols = categorical_cols + numerical_cols

# Fixed seed so that refitting the model is reproducible from run to run.
RANDOM_STATE = 42

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", TargetEncoder(), categorical_cols),
        ("numerical", "passthrough", numerical_cols)
    ]
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", GradientBoostingRegressor(
        learning_rate=0.1,
        n_estimators=300,
        max_depth=7,
        loss="absolute_error",
        random_state=RANDOM_STATE
    ))
])

In [11]:
# Fit the preprocessing step and the regressor together on the training split.
pipeline.fit(X=train[train_cols], y=train[target])

In [12]:
# Ground-truth prices and model predictions for the test split, as arrays.
test_target = test[target].values
test_predictions = pipeline.predict(X=test[train_cols])

In [13]:
# Show the container type of the predictions and of the target values.
tuple(type(values) for values in (test_predictions, test_target))

(numpy.ndarray, numpy.ndarray)

In [14]:
# Evaluate the fitted pipeline on the test split.
X_test, y_true = test[train_cols], test[target]
y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # root of the MSE
mape = mean_absolute_percentage_error(y_true, y_pred)

# (label, value, number of decimals) for each reported metric.
for label, value, digits in (("MAE:", mae, 2), ("RMSE:", rmse, 2), ("MAPE:", mape, 4)):
    print(label, round(value, digits))

MAE: 1983.45
RMSE: 3413.8
MAPE: 0.122


Hyperparameter tuning:

In [None]:
# Candidate hyperparameters; the `model__` prefix addresses the pipeline step
# named "model".
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[3, 5, 7],
    model__learning_rate=[0.01, 0.05, 0.1],
)

# MAE is a loss, so the scorer is declared with greater_is_better=False.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# 5-fold cross-validated search over the full grid, using every available core.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(train[train_cols], train[target])

print("Best parameters found:", grid_search.best_params_)
# The sign is flipped back so the reported MAE is positive.
print("Best MAE:", -grid_search.best_score_)

In [17]:
# Persist the `pipeline` object (not `grid_search`) to disk.
joblib.dump(value=pipeline, filename="model-v1.pkl")

['model-v1.pkl']

Basic testing:

In [18]:
# Reload the saved file so the checks below run against the persisted model.
model = joblib.load(filename="model-v1.pkl")

# Hand-written listings for a quick check of the reloaded model.
# Each row follows the field order given in _SAMPLE_FIELDS.
_SAMPLE_FIELDS = (
    "type",
    "sector",
    "net_usable_area",
    "net_area",
    "n_rooms",
    "n_bathroom",
    "latitude",
    "longitude",
)

_SAMPLE_ROWS = [
    ("casa", "La Reina", 90.0, 100.0, 3, 2, -33.45, -70.65),
    ("departamento", "Las Condes", 75.0, 80.0, 2, 1, -33.4089, -70.5650),
    ("casa", "Ñuñoa", 120.0, 150.0, 4, 3, -33.4569, -70.6032),
    ("departamento", "Providencia", 55.0, 60.0, 2, 1, -33.4305, -70.6167),
    ("casa", "Vitacura", 180.0, 200.0, 5, 4, -33.3792, -70.5781),
    # 13 rooms / 12 bathrooms exceed the limit of 10 that clean_data enforces.
    ("departamento", "Lo Barnechea", 90.0, 95.0, 13, 12, -33.3490, -70.5070),
]

test_samples = [dict(zip(_SAMPLE_FIELDS, row)) for row in _SAMPLE_ROWS]

In [None]:
# Score every sample with a single predict call instead of building a one-row
# DataFrame and calling the model once per sample.
samples_df = pd.DataFrame(test_samples)
predictions = model.predict(samples_df)

for i, prediction in enumerate(predictions, 1):
    print(f"Example {i}: Predicted price = {round(prediction)}")