# XGBoost Gradient-Boosted Trees: Weekly Rent Regression

In [3]:
# Install if needed
# pip install xgboost scikit-learn pandas numpy shap joblib

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib
import shap
import warnings
warnings.filterwarnings("ignore")


In [4]:
# Read the cleaned rental listings table (swap in your own loader if needed)
# df = pd.read_parquet("path/to/your.parquet")
df = pd.read_csv("../data/processed/domain/rental_listings_summary_cleaned.csv")

# Sanity check: the target column must exist before anything downstream runs
assert 'weekly_rent' in df.columns, "DataFrame must contain 'weekly_rent' column as target"

row_count = len(df)
print("Rows:", row_count)

# The twenty columns with the most missing values, worst first
missing_counts = df.isna().sum().sort_values(ascending=False)
print("Missing by column:\n", missing_counts.head(20))

# Summary statistics of the regression target
target_summary = df['weekly_rent'].describe()
print("Target sample stats:\n", target_summary)


AssertionError: DataFrame must contain 'weekly_rent' column as target

In [None]:
# Candidate feature columns (replace these with your actual columns)
numerical_features = ['bedrooms', 'bathrooms', 'land_size_sqm', 'building_area_sqm', 'distance_to_cbd_km', 'age_years']
categorical_features = ['property_type', 'suburb', 'heating_type', 'furnished_flag']

# Keep only available columns to avoid key errors, but report what was dropped
# so a renamed or missing column does not silently change the feature set.
# (print is used rather than warnings.warn because warnings are filtered out
# in the imports cell.)
requested_features = numerical_features + categorical_features
missing_features = [c for c in requested_features if c not in df.columns]
if missing_features:
    print("Skipping feature columns not found in df:", missing_features)

numerical_features = [c for c in numerical_features if c in df.columns]
categorical_features = [c for c in categorical_features if c in df.columns]

# Fail here with a clear message instead of deep inside the model fit.
if not (numerical_features or categorical_features):
    raise ValueError(
        "None of the requested feature columns are present in df; "
        f"available columns: {list(df.columns)}"
    )

X = df[numerical_features + categorical_features].copy()
y = df['weekly_rent'].copy()

# Drop rows whose target is missing; they cannot be used for supervised training.
# (No outlier filtering is applied here.)
valid_mask = y.notna()
X = X[valid_mask]
y = y[valid_mask]

# Train test split: hold out 20% of the rows, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name (passing it raises TypeError). Try the current
# spelling first and fall back to the legacy one on older releases.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: replace missing values with the literal 'MISSING' so they
# get their own category, then one-hot encode into a dense array.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('onehot', onehot_encoder)
])

# Route each column group through its pipeline; any other column is dropped.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, categorical_features)
], remainder='drop')


In [None]:
# Baseline (untuned) hyperparameters for the gradient-boosted regressor
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
xgb = XGBRegressor(**xgb_params)

# Chain preprocessing and the regressor so both are fitted and applied together
pipe = Pipeline(steps=[
    ('preproc', preprocessor),
    ('model', xgb),
])

# Train the baseline pipeline on the training split
pipe.fit(X_train, y_train)

# Predict and evaluate
def evaluate(model, X_tr, y_tr, X_te, y_te):
    """Score a fitted model on the training and test splits.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_tr, y_tr : training features and targets.
    X_te, y_te : test features and targets.

    Returns
    -------
    dict
        MAE, RMSE and R^2 for each split, keyed ``'train_mae'``, ``'test_mae'``,
        ``'train_rmse'``, ``'test_rmse'``, ``'train_r2'``, ``'test_r2'``
        (in that order).
    """
    # Predict once per split (train first, then test) and pair with the truth
    scored_splits = {
        'train': (y_tr, model.predict(X_tr)),
        'test': (y_te, model.predict(X_te)),
    }
    scorers = (
        ('mae', mean_absolute_error),
        # RMSE is the square root of the mean squared error
        ('rmse', lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted))),
        ('r2', r2_score),
    )
    # Outer loop over metrics keeps the train/test pair of each metric adjacent
    return {
        f'{split}_{label}': scorer(*scored_splits[split])
        for label, scorer in scorers
        for split in ('train', 'test')
    }

# Baseline scores for the untuned pipeline on both splits
metrics = evaluate(
    model=pipe,
    X_tr=X_train,
    y_tr=y_train,
    X_te=X_test,
    y_te=y_test,
)
print(metrics)


In [None]:
# Candidate values for the regressor's hyperparameters
xgb_candidates = {
    'n_estimators': [100, 200, 400, 800],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [3, 5, 6, 8],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.5, 1.0],
    'reg_lambda': [1.0, 2.0, 5.0],
}
# The "model__" prefix routes each parameter to the pipeline's 'model' step
param_dist = {f'model__{name}': values for name, values in xgb_candidates.items()}

# Sample 40 configurations, scoring each with 3-fold cross-validated negative MAE
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=40,
    cv=3,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train)

# best_score_ is a negated MAE, so values closer to zero are better
print("Best score", search.best_score_)
print("Best params", search.best_params_)

# Score the best pipeline found by the search on both splits
best_model = search.best_estimator_
metrics_tuned = evaluate(best_model, X_train, y_train, X_test, y_test)
print(metrics_tuned)


In [None]:
# Persist the tuned pipeline (preprocessing + regressor) as a single artifact
model_path = "xgb_weekly_rent_pipeline.joblib"
joblib.dump(best_model, model_path)

# To reuse it later for inference (only load joblib files from a trusted source):
# loaded = joblib.load("xgb_weekly_rent_pipeline.joblib")
# preds = loaded.predict(new_dataframe_with_required_columns)
