IMPORTS:

In [9]:
import joblib
import numpy as np
import os
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

UPDATE the path below to point to your INPUT data

In [10]:
# CSV that is fed through the pipeline below.
# Alternative input (training data): "../data/processed/immo_train_data.csv"
INPUT = "../data/processed/immo_test_data.csv"
df = pd.read_csv(filepath_or_buffer=INPUT)

PREPROCESSING

In [11]:
# Preprocess data
def preprocess(df, target='price', fit=True, preprocessor=None, save_path='../data/processed/df_ml_ready.csv'):
    """Turn the raw property table into an all-numeric, model-ready DataFrame.

    Pipeline: derive ``price_per_sqm`` from the target, recode the
    region-specific EPC label onto a common scale, median-impute and
    standardise the continuous columns, pass the binary flags through,
    one-hot encode the categorical columns, coerce every column to numeric,
    fill remaining NaNs with the column median and finally drop a fixed list
    of raw/identifier columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input. It is copied, never modified. It must contain ``target``,
        ``total_area_sqm``, ``region``, ``epc`` and every column named in the
        column groupings defined in the body.
    target : str, default 'price'
        Name of the target column. It is not part of any column grouping, so
        it reaches the output through the ``remainder='passthrough'`` branch.
    fit : bool, default True
        If True the preprocessor is fitted on ``df`` (a supplied
        ``preprocessor`` is re-fitted); if False it is only applied.
    preprocessor : sklearn.compose.ColumnTransformer or None
        Preprocessor to (re)use. Required when ``fit`` is False.
    save_path : str or None
        CSV destination for the result. Missing parent directories are
        created. A falsy value (``None`` / ``''``) skips saving.

    Returns
    -------
    tuple
        ``(df_ml_ready, preprocessor)``: the numeric DataFrame (with a fresh
        positional index) and the fitted preprocessor.

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``preprocessor`` is supplied.

    Notes
    -----
    ``price_per_sqm_scaled`` is computed from the target, so it must be
    removed from the feature matrix before training or scoring a model.
    """
    # Nothing to transform with: fail early with an explicit message instead
    # of an AttributeError on None further down.
    if preprocessor is None and not fit:
        raise ValueError("preprocess(fit=False) requires an already fitted `preprocessor`.")

    # Copy input
    df_copy = df.copy()

    # Target
    y = df_copy[target]

    # Calculate & add price per sqm. A zero area produces +/-inf; turn those
    # into NaN so the median imputer below can handle them. The result is
    # assigned back explicitly because an in-place replace on a selected
    # column does not propagate to the frame under pandas Copy-on-Write.
    df_copy['price_per_sqm'] = (y / df_copy['total_area_sqm']).replace([np.inf, -np.inf], np.nan)

    # EPC mapping: each region uses its own label scale, so labels are
    # recoded per region onto excellent / good / poor / bad.
    epc_mapping = {
        "Flanders": {"A+": "excellent", "A": "excellent", "B": "good",
                     "C": "poor", "D": "poor", "E": "bad", "F": "bad"},
        "Brussels-Capital": {"A": "excellent", "B": "good", "C": "good",
                              "D": "poor", "E": "poor", "F": "bad", "G": "bad"},
        "Wallonia": {"A++": "excellent", "A+": "excellent", "A": "good",
                     "B": "good", "C": "poor", "D": "poor", "E": "poor",
                     "F": "bad", "G": "bad"}
    }
    # Unknown region or unknown label (including NaN) falls back to 'MISSING'.
    df_copy['epc_recoded'] = [
        epc_mapping.get(region, {}).get(epc, 'MISSING')
        for region, epc in zip(df_copy['region'], df_copy['epc'])
    ]

    # Column groupings
    binary_flags = ['fl_furnished', 'fl_open_fire', 'fl_terrace',
                    'fl_garden', 'fl_swimming_pool', 'fl_floodzone', 'fl_double_glazing']
    numeric_to_scale = ['construction_year', 'total_area_sqm', 'surface_land_sqm',
                        'nbr_frontages', 'terrace_sqm', 'garden_sqm',
                        'primary_energy_consumption_sqm', 'nbr_bedrooms', 'price_per_sqm']
    categorical_cols = ['property_type', 'subproperty_type', 'region', 'province',
                        'equipped_kitchen', 'state_building', 'heating_type', 'epc_recoded']

    # ColumnTransformer (only built when the caller did not supply one)
    if preprocessor is None:
        continuous_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        binary_transformer = 'passthrough'
        categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        preprocessor = ColumnTransformer(transformers=[
            ('num_scaled', continuous_transformer, numeric_to_scale),
            ('bin_flags', binary_transformer, binary_flags),
            ('cat', categorical_transformer, categorical_cols)
        ], remainder='passthrough')

    # Fit/transform or just transform
    if fit:
        X_transformed = preprocessor.fit_transform(df_copy)
    else:
        X_transformed = preprocessor.transform(df_copy)

    # Feature names, in the order the ColumnTransformer emits its blocks:
    # scaled numerics, binary flags, one-hot columns, then every remaining
    # input column in its original order.
    cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
    scaled_features = [f"{col}_scaled" for col in numeric_to_scale]
    handled_cols = set(numeric_to_scale) | set(binary_flags) | set(categorical_cols)
    remainder_cols = [col for col in df_copy.columns if col not in handled_cols]
    all_features = scaled_features + binary_flags + cat_features.tolist() + remainder_cols

    # Build DataFrame
    df_final = pd.DataFrame(X_transformed, columns=all_features)

    # Convert all to numeric (non-numeric values become NaN) and fill NaNs
    # with the column median.
    df_numeric = df_final.apply(pd.to_numeric, errors='coerce')
    df_numeric = df_numeric.fillna(df_numeric.median())

    # Drop columns not needed for ML. Names that are not present in the
    # transformed frame are skipped, so the list is safe to apply as-is.
    columns_to_drop = [
        'id', 'construction_year', 'total_area_sqm', 'surface_land_sqm', 'property_type',
        'subproperty_type', 'nbr_frontages', 'terrace_sqm', 'garden_sqm', 'primary_energy_consumption_sqm',
        'nbr_bedrooms', 'price_per_sqm', 'epc', 'locality'
    ]
    df_ml_ready = df_numeric.drop(columns=[col for col in columns_to_drop if col in df_numeric.columns])

    # Save to CSV. os.makedirs('') raises, so the parent directory is only
    # created when the path actually has one.
    if save_path:
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df_ml_ready.to_csv(save_path, index=False)

    # Return ML-ready DataFrame and fitted preprocessor
    return df_ml_ready, preprocessor

In [None]:
def test_xgb_top20(df_ml_ready, model_path="../models/best_xgb_model_top20.pkl", target_col='price', cols_to_remove=('price_per_sqm_scaled',)):
    """Score the saved top-20-feature XGBoost model on a preprocessed frame.

    Parameters
    ----------
    df_ml_ready : pandas.DataFrame
        Preprocessed data containing ``target_col`` and every feature the
        model stores in its booster.
    model_path : str
        Path of the joblib-serialised model.
    target_col : str, default 'price'
        Name of the ground-truth column.
    cols_to_remove : iterable of str or None
        Columns excluded from the feature matrix before feature selection;
        names that are not present are ignored. ``None`` removes nothing.

    Returns
    -------
    tuple
        ``(metrics, y_pred)`` where ``metrics`` is a dict with the keys
        ``test_R2``, ``test_RMSE`` and ``test_MAE`` and ``y_pred`` holds the
        model predictions.

    Raises
    ------
    KeyError
        If ``target_col`` is missing, if the model stores no feature names,
        or if any feature stored in the model is absent from the data.
    """
    # Load trained model.
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load model files that come from a trusted source.
    xgb_top20 = joblib.load(model_path)

    # Separate target and features
    y_test = df_ml_ready[target_col]
    X_test = df_ml_ready.drop(columns=[target_col])

    # Remove unwanted columns (absent names are silently skipped)
    X_test = X_test.drop(columns=list(cols_to_remove or ()), errors='ignore')

    # Keep only the features the model was trained on, in the model's order
    top20_features = xgb_top20.get_booster().feature_names
    if top20_features is None:
        raise KeyError("The loaded model stores no feature names; cannot select its training columns.")
    missing = [name for name in top20_features if name not in X_test.columns]
    if missing:
        raise KeyError(f"Features expected by the model are missing from the data: {missing}")
    X_test_top20 = X_test[top20_features]

    # Predict
    y_pred = xgb_top20.predict(X_test_top20)

    # Metrics
    metrics = {
        "test_R2": r2_score(y_test, y_pred),
        "test_RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "test_MAE": mean_absolute_error(y_test, y_pred)
    }

    return metrics, y_pred

In [None]:
# END-TO-END RUN: preprocess the loaded data, then score the saved model on it.
# NOTE(review): fit=True fits the imputer/scaler/encoder on whatever INPUT
# holds. If that is evaluation data, the transform will differ from the one
# used at training time -- consider passing the training-time preprocessor
# with fit=False. TODO confirm how the model was trained.
df_ml_ready, preprocessor = preprocess(
    df,
    fit=True,
    save_path='../data/processed/df_ml_ready.csv',
)

# Saved XGBoost top-20 model. The evaluation call below does not use this
# object; test_xgb_top20 loads the file itself from model_path.
xgb_model = joblib.load("../models/best_xgb_model_top20.pkl")

# Evaluate the model on the preprocessed data
metrics, y_pred = test_xgb_top20(
    df_ml_ready,
    model_path="../models/best_xgb_model_top20.pkl",
)

# Report each metric with four decimals
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

test_R2: 0.5118
test_RMSE: 291438.8233
test_MAE: 142617.0701
