# XG Boosted Tree

In [1]:
# Install if needed
# pip install xgboost scikit-learn pandas numpy shap joblib

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib
import shap
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Example: load your dataframe (replace with your IO)
# df = pd.read_parquet("path/to/your.parquet")
df = pd.read_csv("../data/curated/rent_features/cleaned_listings.csv")

# Basic checks
assert 'weekly_rent' in df.columns, "DataFrame must contain 'weekly_rent' column as target"
print("Rows:", len(df))
print("Missing by column:\n", df.isna().sum().sort_values(ascending=False).head(20))
print("Target sample stats:\n", df['weekly_rent'].describe())


Rows: 25714
Missing by column:
 last_sold_date         23560
unit_number            19989
features_list          18024
bathrooms              13675
land_area              13669
structured_features    13431
appointment_only       12381
agent_name             12299
description            12048
median_rent_price      12046
median_sold_price      12046
avg_days_on_market     12046
state_abbreviation     12045
agency_name            12045
number_sold            12045
long_term_resident     12045
single_percentage      12045
schools                12045
renter_percentage      12045
first_listed_date      12045
dtype: int64
Target sample stats:
 count    25714.000000
mean       577.180281
std        189.480516
min         18.000000
25%        450.000000
50%        550.000000
75%        650.000000
max       1420.000000
Name: weekly_rent, dtype: float64


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25714 entries, 0 to 25713
Data columns (total 41 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           25714 non-null  int64  
 1   property_id          25714 non-null  int64  
 2   bedrooms             25485 non-null  object 
 3   bathrooms            12039 non-null  float64
 4   car_spaces           24257 non-null  float64
 5   property_type        25714 non-null  object 
 6   land_area            12045 non-null  float64
 7   property_features    25696 non-null  object 
 8   suburb               25714 non-null  object 
 9   postcode             25714 non-null  int64  
 10  year                 25714 non-null  int64  
 11  quarter              25714 non-null  int64  
 12  age_0_to_19          13669 non-null  float64
 13  age_20_to_39         13669 non-null  float64
 14  age_40_to_59         13669 non-null  float64
 15  age_60_plus          13669 non-null 

In [5]:
# numerical columns
numerical_features = [
    'bedrooms', 'bathrooms', 'car_spaces',
    'land_area', 'year', 'quarter',
    'age_0_to_19', 'age_20_to_39', 'age_40_to_59', 'age_60_plus',
    'avg_days_on_market', 'family_percentage', 'long_term_resident', 
    'median_rent_price', 'median_sold_price', 'number_sold', 'renter_percentage',
    'single_percentage'
    ]

In [6]:
# categorical columns
categorical_features = [
    'property_type', 'suburb', 'postcode',
    'agency_name', 'agent_name', 'appointment_only',
    'listing_status', 'state_abbreviation', 'structured_features',
    'unit_number'
]

In [7]:
# Keep only available columns to avoid key errors
numerical_features = [c for c in numerical_features if c in df.columns]
categorical_features = [c for c in categorical_features if c in df.columns]

X = df[numerical_features + categorical_features].copy()
y = df['weekly_rent'].copy()

# Optional: quick target filtering to remove obvious outliers or bad rows
valid_mask = y.notna()
X = X[valid_mask]
y = y[valid_mask]

# Train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [9]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, categorical_features)
], remainder='drop')


In [10]:
xgb = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

pipe = Pipeline([
    ('preproc', preprocessor),
    ('model', xgb)
])

# Fit
pipe.fit(X_train, y_train)

# Predict and evaluate
def evaluate(model, X_tr, y_tr, X_te, y_te):
    yhat_tr = model.predict(X_tr)
    yhat_te = model.predict(X_te)
    metrics = {
        'train_mae': mean_absolute_error(y_tr, yhat_tr),
        'test_mae': mean_absolute_error(y_te, yhat_te),
        'train_rmse': np.sqrt(mean_squared_error(y_tr, yhat_tr)),
        'test_rmse': np.sqrt(mean_squared_error(y_te, yhat_te)),
        'train_r2': r2_score(y_tr, yhat_tr),
        'test_r2': r2_score(y_te, yhat_te),
    }
    return metrics

metrics = evaluate(pipe, X_train, y_train, X_test, y_test)
print(metrics)


ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: '208ha'

In [None]:
param_dist = {
    'model__n_estimators': [100, 200, 400, 800],
    'model__learning_rate': [0.01, 0.03, 0.05, 0.1],
    'model__max_depth': [3, 5, 6, 8],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__reg_alpha': [0, 0.5, 1.0],
    'model__reg_lambda': [1.0, 2.0, 5.0],
}

search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=40,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)
print("Best score", search.best_score_)
print("Best params", search.best_params_)

best_model = search.best_estimator_
metrics_tuned = evaluate(best_model, X_train, y_train, X_test, y_test)
print(metrics_tuned)


In [None]:
# Save the entire pipeline (preprocessor + model)
joblib.dump(best_model, "xgb_weekly_rent_pipeline.joblib")

# Load for later inference
# loaded = joblib.load("xgb_weekly_rent_pipeline.joblib")
# preds = loaded.predict(new_dataframe_with_required_columns)
