In [1]:
random_state = 9999
image_output_params = {'width': 1080, 'height': 600, 'scale': 6}

# Machine-Learning
Set-up

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split # Random Split
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score
from sklearn.inspection import permutation_importance


import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('data/clean_listing.csv', index_col=0)
# Split data and drop unnecessary data
Y = df['price']
X = df.drop(['price', 'latitude', 'longitude'], axis=1)
X = X.astype('float64')
X.head()
# Storage
result = {'Method':[], 'Train R2':[], 'Train RMSE':[], 'Test R2': [], 'Test RMSE': []}

In [4]:
f"The standard deviation of price of listing is ${df['price'].std():.2f}."

'The standard deviation of price of listing is $132.36.'

In [5]:
def update_frame(frame: dict, y_train, y_train_pred, y_true, y_pred, method_name):
    frame['Method'].append(method_name)
    frame['Train R2'].append(r2_score(y_train, y_train_pred))
    frame['Train RMSE'].append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
    frame['Test R2'].append(r2_score(y_true, y_pred))
    frame['Test RMSE'].append(np.sqrt(mean_squared_error(y_true, y_pred)))

def calculate_score(y_true, y_pred):
    print(f"R2 Score {r2_score(y_true, y_pred)}")
    print(f"RMSE : {np.sqrt(mean_squared_error(y_true, y_pred))}")

def plot(y_true, y_pred, title, band=True):
    # Plotly express plot
    h = pd.DataFrame(data={'Actual y':y_true, 'Predicted y': y_pred})
    RMSE = df['price'].std()

    fig = px.scatter(h, x='Actual y', y='Predicted y', title=title)
    fig.add_trace(go.Scatter(x=[0, 600],y=[0, 600],mode="lines",line=go.scatter.Line(color='gray'),showlegend=False)) #Diagonal Line
    if band:
        fig.add_traces([
            go.Scatter(x=[0, 600],y=[0+RMSE, 600+RMSE],mode='lines',line=go.scatter.Line(dash='dot'),showlegend=False),
            go.Scatter(x=[0, 600],y=[0-RMSE, 600-RMSE],mode='lines',line=go.scatter.Line(dash='dot'),showlegend=False)
            ])
    fig.show()
    return fig

def simulate(model, XXyy, name: str):
    # Model must be fitted and implement predict, score methods.
    # XXyy: X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = XXyy
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    update_frame(result, y_train, y_train_pred, y_test, y_pred, name)

    print(f"Train\n Score: {model.score(X_train, y_train)}")
    calculate_score(y_train, y_train_pred)

    print(f"Test\n Score: {model.score(X_test, y_test)}")
    calculate_score(y_test, y_pred)
    
    fig = plot(y_test, y_pred, name)
    fig.write_image(f'images/{name}.png', **image_output_params)

def feature_plot(feature_name, feature_score, name: str):
    feature_importance = pd.DataFrame({'Feature': feature_name, 'Score': feature_score})
    feature_importance.sort_values(by='Score', axis=0, inplace=True, ascending=False)

    fig = px.bar(feature_importance.head(20), x='Score', y='Feature', orientation='h')
    fig.update_layout(title={'text': f'{name} Most Relevant Feature'})
    fig.show()
    fig.write_image(f'images/{name} Most Relevant Feature.png', **image_output_params)
    fig = px.bar(feature_importance.tail(20), x='Score', y='Feature', orientation='h')
    fig.update_layout(title={'text': f'{name} Least Relevant Feature'})
    fig.show()
    fig.write_image(f'images/{name} Least Relevant Feature.png', **image_output_params)
        

# Part 1:
Finding a model that best predicts price. We will being trying :
- linear model: Linear Regression
- tree-based model: HistGradientBoostRegressor, XGBoost
- and to try something new: Neural Network.

We will be using $R^2$ score to compare the goodness of the model, and RMSE to find the accuracy of the prediction.

### Model 1: Linear Regression

We first use linear regression, a method taught for predicting numerical values.

In [6]:
XXyy = X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=random_state)

In [7]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)

simulate(model, XXyy, 'Linear Regression')
feature_plot([col for col in X.columns], [model.coef_[i].round(5) for i in range(len(X.columns))], 'Linear Regression')

Train
 Score: 0.6185228181568949
R2 Score 0.6185228181568949
RMSE : 82.29618881032178
Test
 Score: 0.5779888564116571
R2 Score 0.5779888564116571
RMSE : 83.35227735073117


### Model 2: Gradient Boost Regressor

We can use gradient boosting - a type of ensemble machine learning algorithms. Ensemble is a collection of decision trees. Every iteration, a decision tree is added to model to minimise the error. Models are fit using any arbitrary differentiable loss function and gradient descent optimization algorithm, and the goal is to minimise loss gradient. We will be using HistGradientBoostingRegressor from sklearn.

In [8]:
from sklearn.ensemble import HistGradientBoostingRegressor

model = HistGradientBoostingRegressor(random_state=random_state, max_iter=1000)

model.fit(X_train, y_train)

simulate(model, XXyy, name='Gradient Boost Regressor')

feature_plot(model.feature_names_in_, 
             permutation_importance(model, X_test, y_test, random_state=random_state).importances_mean, 
             'Gradient Boost Regressor')

Train
 Score: 0.9999217830187603
R2 Score 0.9999217830187603
RMSE : 1.178408769113613
Test
 Score: 0.7147757332953467
R2 Score 0.7147757332953467
RMSE : 68.52495306058684


### Model 3: XGBoost Regression

We can use gradient boosting - a type of ensemble machine learning algorithms. Ensemble is a collection of decision trees. Every iteration, a decision tree is added to model to minimise the error. Models are fit using any arbitrary differentiable loss function and gradient descent optimization algorithm, and the goal is to minimise loss gradient. We will use XGBoost API to do the model fitting.

In [9]:
import xgboost as xgb

model = xgb.sklearn.XGBRegressor(tree_method='hist', objective='reg:squarederror', n_estimators = 2500,
                         learning_rate=0.01, random_state=random_state)

model.fit(X_train, y_train)

simulate(model, XXyy, 'XGBoost Regressor')

booster = model.get_booster()
feature_plot(booster.get_score().keys(), booster.get_score().values(), 'XGBoost Regressor')

Train
 Score: 0.9904317474468289
R2 Score 0.9904317474468289
RMSE : 13.033520980706674
Test
 Score: 0.705241733013719
R2 Score 0.705241733013719
RMSE : 69.66080804119946


### Model 4: Neural Networks

We will be using Sklearn MLPRegressor, short for Multi-layer Perception Regressor. Model optimizes for lowest squarer error by gradient descent.

In [10]:
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(random_state=random_state, max_iter=1000).fit(X_train, y_train)

simulate(model, XXyy, 'MLP Regressor')

Train
 Score: 0.7678853210043803
R2 Score 0.7678853210043803
RMSE : 64.19434079879785
Test
 Score: 0.6443170607563389
R2 Score 0.6443170607563389
RMSE : 76.5221260462495


We can conclude that HistGradientBoostingRegressor and XGBoost Regressor achieve the best results, with the lowest RMSE (~65) and the higher R^2 score (~0.74). All other models achieve decent results, with RMSE around half of standard deviation, and R^2 > 0.5.

# Part 2: Optimising models

We will be using RandomizedSearchCV, GridSearchCV to find the optimal parameters.
- Benefit: Can finds optimal values between the 'grid' in a GridSearchCV, do not need prior experience & intuition.
- Downside: Takes significantly longer to find optimal values.

In [11]:
from scipy.stats import loguniform
# Code from scikit-learn-mooc
class loguniform_int:
    """Integer valued version of the log-uniform distribution"""
    def __init__(self, a, b):
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        """Random variable sample"""
        return self._distribution.rvs(*args, **kwargs).astype(int)

In [12]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

model = HistGradientBoostingRegressor(random_state=random_state)

param_distributions = {
    'max_bins': loguniform_int(2, 255),
    'max_leaf_nodes': loguniform_int(2, 256),
    'min_samples_leaf': loguniform_int(1, 100),
    'learning_rate': loguniform(0.001, 10),
}

cv = RandomizedSearchCV(model, param_distributions=param_distributions, n_iter=350, cv=3, n_jobs=-1, scoring=['neg_mean_squared_error', 'r2'], refit='r2') .fit(X_train, y_train)

v = pd.DataFrame(cv.cv_results_)
v.sort_values(by='rank_test_r2', inplace=True)
display(v.head(10))

print(f"Best parameters: {cv.best_params_}")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_bins,param_max_leaf_nodes,param_min_samples_leaf,params,split0_test_neg_mean_squared_error,...,split2_test_neg_mean_squared_error,mean_test_neg_mean_squared_error,std_test_neg_mean_squared_error,rank_test_neg_mean_squared_error,split0_test_r2,split1_test_r2,split2_test_r2,mean_test_r2,std_test_r2,rank_test_r2
141,0.335587,0.004977,0.007483,0.000191,0.140852,13,29,2,"{'learning_rate': 0.14085197310400102, 'max_bi...",-4449.293726,...,-4303.753166,-4750.335986,531.979297,1,0.743192,0.715683,0.735763,0.731546,0.01162,1
111,0.555575,0.033064,0.008607,0.002567,0.05612,5,67,4,"{'learning_rate': 0.056120087689263576, 'max_b...",-4577.80017,...,-4025.464797,-4832.181351,783.460227,2,0.735775,0.69524,0.752849,0.727955,0.02416,2
11,0.825227,0.052004,0.011681,0.002047,0.077518,46,161,12,"{'learning_rate': 0.07751820521492997, 'max_bi...",-4387.121308,...,-4339.181139,-4855.314301,696.298811,3,0.74678,0.698014,0.733588,0.726127,0.020596,3
107,0.553764,0.042688,0.00819,0.001262,0.047275,15,60,12,"{'learning_rate': 0.04727479742887529, 'max_bi...",-4505.034835,...,-4319.733268,-4876.705332,660.992648,4,0.739975,0.699787,0.734782,0.724848,0.017847,4
148,0.823028,0.013281,0.004939,9.4e-05,0.069284,251,27,2,"{'learning_rate': 0.06928391024448323, 'max_bi...",-4439.865715,...,-4377.520744,-4904.921719,702.234464,5,0.743736,0.695028,0.731234,0.723333,0.020655,5
45,0.895846,0.016097,0.010893,0.002259,0.08552,14,115,1,"{'learning_rate': 0.08551958447304087, 'max_bi...",-4125.131291,...,-4403.616851,-4922.374613,937.472734,7,0.761902,0.677394,0.729632,0.722976,0.03482,6
190,0.511798,0.027087,0.005775,0.000165,0.116419,67,36,8,"{'learning_rate': 0.11641903167204896, 'max_bi...",-4592.53116,...,-4330.084874,-4917.023013,653.323802,6,0.734924,0.698592,0.734146,0.722554,0.016947,7
198,0.748778,0.016717,0.013817,0.003545,0.041237,5,205,10,"{'learning_rate': 0.04123743715210744, 'max_bi...",-4737.986293,...,-4216.709275,-4924.492767,667.207284,8,0.726529,0.699092,0.741107,0.722243,0.017418,8
187,0.986932,0.063294,0.010906,0.000826,0.04437,5,131,2,"{'learning_rate': 0.04436991000452049, 'max_bi...",-4472.348066,...,-4389.092069,-4932.674737,710.684347,9,0.741861,0.693,0.730524,0.721795,0.02088,9
279,1.044041,0.038474,0.009762,0.003729,0.130539,87,69,4,"{'learning_rate': 0.1305389024853232, 'max_bin...",-4726.51065,...,-4154.209168,-4938.872008,742.709088,11,0.727191,0.693036,0.744945,0.721724,0.021541,10


Best parameters: {'learning_rate': 0.14085197310400102, 'max_bins': 13, 'max_leaf_nodes': 29, 'min_samples_leaf': 2}


We can see that parameters with rank 1 has higher R^2 score. 

Note that due to cross-validation, R^2 score in GridSearchCV is lower than the above example, however it still shows the better parameters.

In [13]:
model = HistGradientBoostingRegressor(**cv.best_params_, random_state=random_state).fit(X_train, y_train)
simulate(model, XXyy, 'CV Gradient Boost Regressor')

Train
 Score: 0.9737201147702319
R2 Score 0.9737201147702319
RMSE : 21.60017016529359
Test
 Score: 0.7027618558663589
R2 Score 0.7027618558663589
RMSE : 69.95323142274377


### Tuning XGBoost Regressor

Parameters to tune:
- n estimators & learning_rate: Takes longer to achieve same error reduction, however smaller steps taken means we can find the optimal minimum
- max_leaves (default 0): By restricting max leaves, we can reduce overfitting.
- colsample_bytree (default 1): Fraction of columns to be randomly sampled, might reduce overfitting.
- subsample (default 1): Fraction of observations to sample for each tree, lower values reduce overfitting.

In [14]:
model = xgb.XGBRegressor(tree_method='hist', objective='reg:squarederror', seed=random_state)

params_grid = {
    'n_estimators': [1000, 1500],
    'learning_rate': [0.03, 0.05],
    'max_leaves': [0, 20, 40, 60, 80],
    'colsample_bytree': [0.4, 0.6, 0.8],
    'subsample': [0.6, 0.8, 1],
}

cv = GridSearchCV(estimator=model, param_grid=params_grid, cv=3, n_jobs=-1, scoring=['neg_mean_squared_error', 'r2'], refit='r2').fit(X_train, y_train)

v = pd.DataFrame(cv.cv_results_)
v.sort_values(by='rank_test_r2', inplace=True)
display(v.head(10))

print(f"Best parameters: {cv.best_params_}")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_leaves,param_n_estimators,param_subsample,params,...,split2_test_neg_mean_squared_error,mean_test_neg_mean_squared_error,std_test_neg_mean_squared_error,rank_test_neg_mean_squared_error,split0_test_r2,split1_test_r2,split2_test_r2,mean_test_r2,std_test_r2,rank_test_r2
148,4.495689,0.068707,0.03845,0.003114,0.8,0.03,80,1500,0.8,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",...,-4044.48198,-4545.737941,647.482139,1,0.761463,0.717646,0.751682,0.743597,0.01878,1
124,4.629255,0.073547,0.038911,0.002696,0.8,0.03,0,1500,0.8,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",...,-4044.48198,-4545.737941,647.482139,1,0.761463,0.717646,0.751682,0.743597,0.01878,1
145,3.003223,0.044337,0.028072,0.002445,0.8,0.03,80,1000,0.8,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",...,-4043.879369,-4547.912513,648.043909,3,0.761214,0.717501,0.751719,0.743478,0.018773,3
121,3.109123,0.02472,0.0273,0.000481,0.8,0.03,0,1000,0.8,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",...,-4043.879369,-4547.912513,648.043909,3,0.761214,0.717501,0.751719,0.743478,0.018773,3
142,4.567528,0.110845,0.039153,0.002986,0.8,0.03,60,1500,0.8,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",...,-4003.070719,-4554.022164,675.948355,5,0.760296,0.715266,0.754224,0.743262,0.019951,5
139,3.030049,0.06697,0.026049,0.001574,0.8,0.03,60,1000,0.8,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",...,-4008.296628,-4557.23112,676.024889,6,0.760246,0.715082,0.753903,0.743077,0.019964,6
135,4.423261,0.051353,0.044821,0.000648,0.8,0.03,40,1500,0.6,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",...,-4007.674682,-4565.764982,703.804253,7,0.761559,0.71255,0.753941,0.742683,0.021533,7
88,4.550499,0.047611,0.036602,0.001286,0.6,0.03,80,1500,0.8,"{'colsample_bytree': 0.6, 'learning_rate': 0.0...",...,-3900.455927,-4574.537686,768.218126,9,0.759098,0.707849,0.760524,0.742491,0.024502,8
64,3.374523,0.040353,0.042722,0.011014,0.6,0.03,0,1500,0.8,"{'colsample_bytree': 0.6, 'learning_rate': 0.0...",...,-3900.455927,-4574.537686,768.218126,9,0.759098,0.707849,0.760524,0.742491,0.024502,8
61,2.172908,0.027806,0.021556,0.002558,0.6,0.03,0,1000,0.8,"{'colsample_bytree': 0.6, 'learning_rate': 0.0...",...,-3899.312943,-4574.974162,767.790882,11,0.758926,0.707876,0.760595,0.742466,0.024468,10


Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.03, 'max_leaves': 0, 'n_estimators': 1500, 'subsample': 0.8}


In [15]:
model = xgb.XGBRegressor(**cv.best_params_,).fit(X_train, y_train)
simulate(model, XXyy, 'CV XGBoost Regressor')

Train
 Score: 0.9991110944599431
R2 Score 0.9991110944599431
RMSE : 3.972585443117286
Test
 Score: 0.7415161213887128
R2 Score 0.7415161213887128
RMSE : 65.23373446204694


For XGBoost Regression, with parameters tuned, it is the best model, with R^2 score of 0.8 and RMSE score of 56.

Using Hyperopt-Sklearn, which is a wrapper of Hyperopt, a library for Distributed Asynchronous Hyper-parameter Optimization.

In [16]:
from hpsklearn import HyperoptEstimator, hist_gradient_boosting_regressor, xgboost_regression
from hyperopt import tpe

model = HyperoptEstimator(
    regressor=hist_gradient_boosting_regressor('HGBR'),
    preprocessing=[],
    algo=tpe.suggest,
    max_evals=20,
    n_jobs=-1)
model.fit(X_train, y_train)

simulate(model, XXyy, 'Hyperopt Gradient Boost Regressor')

100%|██████████| 1/1 [00:01<00:00,  1.25s/trial, best loss: 0.3295883564967361]
100%|██████████| 2/2 [00:00<00:00,  1.04trial/s, best loss: 0.3295883564967361]
100%|██████████| 3/3 [00:00<00:00,  1.15trial/s, best loss: 0.3295883564967361]
100%|██████████| 4/4 [00:00<00:00,  1.07trial/s, best loss: 0.3295883564967361]
100%|██████████| 5/5 [00:00<00:00,  1.04trial/s, best loss: 0.3295883564967361]
100%|██████████| 6/6 [00:00<00:00,  1.03trial/s, best loss: 0.3295883564967361]
100%|██████████| 7/7 [00:00<00:00,  1.10trial/s, best loss: 0.3295883564967361]
100%|██████████| 8/8 [00:01<00:00,  1.09s/trial, best loss: 0.3295883564967361]
100%|██████████| 9/9 [00:00<00:00,  1.05trial/s, best loss: 0.3295883564967361]
100%|██████████| 10/10 [00:00<00:00,  1.06trial/s, best loss: 0.3295883564967361]
100%|██████████| 11/11 [00:00<00:00,  1.06trial/s, best loss: 0.3295883564967361]
100%|██████████| 12/12 [00:00<00:00,  1.02trial/s, best loss: 0.3295883564967361]
100%|██████████| 13/13 [00:00<00:0

In [17]:
model = HyperoptEstimator(
    regressor=xgboost_regression('XGBR'),
    preprocessing=[],
    algo=tpe.suggest,
    max_evals=10,
    n_jobs=-1)
model.fit(X_train, y_train)

simulate(model, XXyy, 'Hyperopt XGBoost Regressor')

100%|██████████| 1/1 [00:01<00:00,  1.40s/trial, best loss: 1.678669890275082]
100%|██████████| 2/2 [00:05<00:00,  5.41s/trial, best loss: 0.2711993720160071]
100%|██████████| 3/3 [00:06<00:00,  6.07s/trial, best loss: 0.2472804477605668]
100%|██████████| 4/4 [00:03<00:00,  3.04s/trial, best loss: 0.2472804477605668]
100%|██████████| 5/5 [00:02<00:00,  2.96s/trial, best loss: 0.2472804477605668]
100%|██████████| 6/6 [00:07<00:00,  7.41s/trial, best loss: 0.2472804477605668]
100%|██████████| 7/7 [00:05<00:00,  5.71s/trial, best loss: 0.2472804477605668]
100%|██████████| 8/8 [00:05<00:00,  5.44s/trial, best loss: 0.2472804477605668]
100%|██████████| 9/9 [00:07<00:00,  7.75s/trial, best loss: 0.2472804477605668]
100%|██████████| 10/10 [00:02<00:00,  2.40s/trial, best loss: 0.2472804477605668]
Train
 Score: 0.9700878145495286
R2 Score 0.9700878145495286
RMSE : 23.04461796482208
Test
 Score: 0.7455925180734144
R2 Score 0.7455925180734144
RMSE : 64.71730894859594


From the optimization results above, we can see that achieving ~63 RMSE, ~0.75 $R^2$ is the best result we can get, the improvement is marginal.

From the plots and the RMSE, $R^2$ values, we can see that the prediction runs well when $0 \leq price \leq 220$ and as the price increases past 400, the price prediction is significantly further actual price. One reason for this would be that the listing is inaccurate, as not all listings in the dataset are successfully rented, the price of the rental house is artifically increased while providing no other benefits from houses with similiar 'criterias'. Another reason would be that any speciality of the house, like renovation, is only observable through listing picture and cannot be analyzed numerically or categorically. Further improvements can be made if we attempt to anaylze listing description using sentiment analysis etc.

In [18]:
# Save for analysis
pd.DataFrame(result).to_csv('data/model_results.csv')