# Part 3B - Rental Price Prediction (With Fewer Features)

### Content  
- [Section 1 - Data Pre-processing](#section-1)   
- [Section 2 - Modeling - XGBoost Regressor](#section-2)  
- [Section 3 - Modeling - LightGBM Regressor](#section-3)  

Exclude AMENITIES, and also testing on XGBoost and LightGBM only

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
import warnings
warnings.filterwarnings("ignore")

___
<a name="section-1"></a>
## (i) Data Pre-Processing

In [4]:
condo_df = pd.read_csv('./Data/condo_dataset_preprocessed.csv')

#### Drop unnecessary columns (Exclude amenities as well)

In [5]:
condo_df.columns

Index(['amenities', 'beds', 'developer', 'district', 'electoral_div', 'lease',
       'name', 'nearest_mrt_name', 'neighbourhood', 'property_type', 'rental',
       'sqft', 'tenure', 'travel_time_changi', 'travel_time_orchard',
       'travel_time_raffles', 'furnishing', 'amenities_count',
       'nearest_mrt_dist_min', 'nearest_mrt_dist_metres', 'freehold_status',
       'rental_log'],
      dtype='object')

In [6]:
cols_to_drop = ['name','beds','developer','electoral_div','neighbourhood','rental','nearest_mrt_name','property_type', 'amenities',
               'amenities_count','nearest_mrt_dist_min','nearest_mrt_dist_metres','furnishing','tenure','freehold_status']

In [7]:
condo_df_sm = condo_df.drop(columns = cols_to_drop)
condo_df_sm.head()

Unnamed: 0,district,lease,sqft,travel_time_changi,travel_time_orchard,travel_time_raffles,rental_log
0,D09,Flexible,2100,53,12,16,9.159047
1,D09,Flexible,1399,56,12,13,8.853665
2,D05,greater_than_or_equal_24m,1948,70,27,22,8.732305
3,D22,Flexible,775,85,46,39,7.937375
4,D15,Flexible,635,46,41,34,7.783224


___
<a name="section-2"></a>
## (ii) Modeling - XGBoost Regressor

In [9]:
condo_df = condo_df_sm.copy()

# One-hot encoding of categorical variables
condo_df = pd.get_dummies(condo_df, columns = ['district'], drop_first = False) 
condo_df = pd.get_dummies(condo_df, columns = ['lease'], drop_first = False) 

# Separate dependent(target) variable and independent variables
X = condo_df.drop(['rental_log'], axis = 1)
y = condo_df['rental_log']

# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [10]:
xgb_param_grid = {'colsample_bytree': [0.4,0.5,0.6,0.7,0.8],
                  'learning_rate' : [0.01,0.1,0.3],
                  'max_depth': [3,4,5,6], 
                  'alpha': [1e-5, 1e-2, 0.1, 1, 100], 
                  'n_estimators':[200]
}

In [25]:
xgb_reg = xgb.XGBRegressor(seed = 42, objective ='reg:squarederror')

xgb_reg_grid = GridSearchCV(estimator = xgb_reg, param_grid = xgb_param_grid, cv=5, n_jobs=-1, verbose=1) 

xgb_reg_grid.fit(X_train, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  4.7min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, seed=42,
                                    subsample=None, tree_method=None,
      

In [26]:
xgb_reg_grid.best_params_

{'alpha': 0.01,
 'colsample_bytree': 0.7,
 'learning_rate': 0.3,
 'max_depth': 6,
 'n_estimators': 200}

In [27]:
best_xgb_reg = xgb_reg_grid.best_estimator_
y_pred_test = best_xgb_reg.predict(X_test)

In [28]:
# Store results as dataframe
xgb_dict = {'Model':'XGBoost Regressor',
          'R^2':metrics.r2_score(y_test, y_pred_test),
          'Adjusted R^2':(1 - (1-metrics.r2_score(y_test, y_pred_test))*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)),
          'MAE':metrics.mean_absolute_error(y_test, y_pred_test),
          'MSE':metrics.mean_squared_error(y_test, y_pred_test),
          'RMSE':np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))}

xgb_metrics = pd.DataFrame.from_dict(xgb_dict, orient = 'index').T
xgb_metrics

Unnamed: 0,Model,R^2,Adjusted R^2,MAE,MSE,RMSE
0,XGBoost Regressor,0.939152,0.937703,0.0816267,0.0174303,0.132024


Metrics of XGBoost Regressor (ALL features): 0.938933,	0.937171,	0.0845944,	0.0174931,	0.132261

Similar performance with or without the amenities features

#### Feature Importance

In [29]:
xgb_features = pd.DataFrame([X_train.columns, best_xgb_reg.feature_importances_]).T
xgb_features = xgb_features.rename(columns={0: 'Feature', 1: 'Importance Score'})
xgb_features.sort_values(by = 'Importance Score', ascending = False).head(10)

Unnamed: 0,Feature,Importance Score
13,district_D10,0.266733
0,sqft,0.163545
7,district_D04,0.159607
12,district_D09,0.0518453
2,travel_time_orchard,0.0495779
27,district_D25,0.0360125
5,district_D02,0.0285344
3,travel_time_raffles,0.0260615
1,travel_time_changi,0.0187806
11,district_D08,0.0175022


#### Model with just the numeric features (i.e. sqft, travel times)

In [37]:
X_train_sm = X_train[['sqft','travel_time_changi','travel_time_orchard','travel_time_raffles']]
X_test_sm = X_test[['sqft','travel_time_changi','travel_time_orchard','travel_time_raffles']]

In [38]:
xgb_reg = xgb.XGBRegressor(seed = 42, objective ='reg:squarederror')

xgb_reg_grid = GridSearchCV(estimator = xgb_reg, param_grid = xgb_param_grid, cv=5, n_jobs=-1, verbose=2) 

xgb_reg_grid.fit(X_train_sm, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  2.3min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, seed=42,
                                    subsample=None, tree_method=None,
      

In [39]:
best_xgb_reg = xgb_reg_grid.best_estimator_
y_pred_test = best_xgb_reg.predict(X_test_sm)

In [41]:
# Store results as dataframe
xgb_dict = {'Model':'XGBoost Regressor',
          'R^2':metrics.r2_score(y_test, y_pred_test),
          'Adjusted R^2':(1 - (1-metrics.r2_score(y_test, y_pred_test))*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)),
          'MAE':metrics.mean_absolute_error(y_test, y_pred_test),
          'MSE':metrics.mean_squared_error(y_test, y_pred_test),
          'RMSE':np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))}

xgb_metrics_sm = pd.DataFrame.from_dict(xgb_dict, orient = 'index').T
xgb_metrics_sm

Unnamed: 0,Model,R^2,Adjusted R^2,MAE,MSE,RMSE
0,XGBoost Regressor,0.934522,0.932963,0.0837601,0.0187566,0.136955


With just the numerical features, the RMSE worsened slightly from 0.132024 to 0.136955. Meaning that the inclusion of features like district and lease features helped with the performance.

___
<a name="section-3"></a>
## (iii) Modeling - LightGBM Regressor

In [16]:
condo_df = condo_df_sm.copy()

# One-hot encoding of categorical variables
condo_df = pd.get_dummies(condo_df, columns = ['district'], drop_first = False) 
condo_df = pd.get_dummies(condo_df, columns = ['lease'], drop_first = False) 

# Separate dependent(target) variable and independent variables
X = condo_df.drop(['rental_log'], axis = 1)
y = condo_df['rental_log']

# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [17]:
gbm_param_grid = {'boosting_type': ['gbdt'],
                  'objective': ['regression'],
                  'metric': ['rmse'],
                  'max_depth': [4,6,8,9,10,11,12], 
                  'learning_rate': [0.001,0.01,0.05,0.1,0.2]}

In [18]:
light_gbm_reg = lgb.LGBMRegressor(seed = 42, num_iterations = 1000)

light_gbm_grid = GridSearchCV(estimator = light_gbm_reg, param_grid = gbm_param_grid, cv=5, n_jobs=-1, verbose=1) 

light_gbm_grid.fit(X_train, y_train)

Fitting 5 folds for each of 35 candidates, totalling 175 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done 175 out of 175 | elapsed:  2.5min finished




GridSearchCV(cv=5, estimator=LGBMRegressor(num_iterations=1000, seed=42),
             n_jobs=-1,
             param_grid={'boosting_type': ['gbdt'],
                         'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
                         'max_depth': [4, 6, 8, 9, 10, 11, 12],
                         'metric': ['rmse'], 'objective': ['regression']},
             verbose=1)

In [19]:
light_gbm_grid.best_params_

{'boosting_type': 'gbdt',
 'learning_rate': 0.1,
 'max_depth': 12,
 'metric': 'rmse',
 'objective': 'regression'}

In [20]:
best_light_gbm = light_gbm_grid.best_estimator_
y_pred_test = best_light_gbm.predict(X_test)

In [21]:
# Store results as dataframe
lgbm_dict = {'Model':'LightGBM Regressor',
          'R^2':metrics.r2_score(y_test, y_pred_test),
          'Adjusted R^2':(1 - (1-metrics.r2_score(y_test, y_pred_test))*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)),
          'MAE':metrics.mean_absolute_error(y_test, y_pred_test),
          'MSE':metrics.mean_squared_error(y_test, y_pred_test),
          'RMSE':np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))}

lgbm_metrics = pd.DataFrame.from_dict(lgbm_dict, orient = 'index').T
lgbm_metrics

Unnamed: 0,Model,R^2,Adjusted R^2,MAE,MSE,RMSE
0,LightGBM Regressor,0.933048,0.931454,0.0859635,0.0191789,0.138488


Metrics of LightGBM  Regressor (Full Features): 0.935645,	0.933788,	0.0852931,	0.018435,	0.135776

#### LightGBM Regressor Feature Importance

In [24]:
lgb_features = pd.DataFrame([X_train.columns, best_light_gbm.feature_importances_]).T
lgb_features = lgb_features.rename(columns={0: 'Feature', 1: 'Importance Score'})
lgb_features.sort_values(by = 'Importance Score', ascending = False).head(10)

Unnamed: 0,Feature,Importance Score
0,sqft,11755
1,travel_time_changi,5483
2,travel_time_orchard,4806
3,travel_time_raffles,4683
31,lease_Flexible,403
12,district_D09,286
32,lease_greater_than_or_equal_24m,257
13,district_D10,173
7,district_D04,153
14,district_D11,137


#### Model with just the numeric features (i.e. sqft, travel times)

In [33]:
X_train_sm = X_train[['sqft','travel_time_changi','travel_time_orchard','travel_time_raffles']]
X_test_sm = X_test[['sqft','travel_time_changi','travel_time_orchard','travel_time_raffles']]

In [31]:
light_gbm_reg = lgb.LGBMRegressor(seed = 42, num_iterations = 1000)

light_gbm_grid = GridSearchCV(estimator = light_gbm_reg, param_grid = gbm_param_grid, cv=5, n_jobs=-1, verbose=1) 

light_gbm_grid.fit(X_train_sm, y_train)

Fitting 5 folds for each of 35 candidates, totalling 175 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done 175 out of 175 | elapsed:  2.2min finished




GridSearchCV(cv=5, estimator=LGBMRegressor(num_iterations=1000, seed=42),
             n_jobs=-1,
             param_grid={'boosting_type': ['gbdt'],
                         'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
                         'max_depth': [4, 6, 8, 9, 10, 11, 12],
                         'metric': ['rmse'], 'objective': ['regression']},
             verbose=1)

In [42]:
light_gbm_grid.best_params_

{'boosting_type': 'gbdt',
 'learning_rate': 0.1,
 'max_depth': 11,
 'metric': 'rmse',
 'objective': 'regression'}

In [35]:
best_light_gbm = light_gbm_grid.best_estimator_
y_pred_test = best_light_gbm.predict(X_test_sm)

In [36]:
# Store results as dataframe
lgbm_dict = {'Model':'LightGBM Regressor',
          'R^2':metrics.r2_score(y_test, y_pred_test),
          'Adjusted R^2':(1 - (1-metrics.r2_score(y_test, y_pred_test))*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)),
          'MAE':metrics.mean_absolute_error(y_test, y_pred_test),
          'MSE':metrics.mean_squared_error(y_test, y_pred_test),
          'RMSE':np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))}

lgbm_metrics_sm = pd.DataFrame.from_dict(lgbm_dict, orient = 'index').T
lgbm_metrics_sm

Unnamed: 0,Model,R^2,Adjusted R^2,MAE,MSE,RMSE
0,LightGBM Regressor,0.934002,0.93243,0.084159,0.0189056,0.137498


___
### Model Comparison

In [43]:
df_metrics = pd.concat([xgb_metrics_sm, lgbm_metrics_sm])
df_metrics.sort_values(by = 'RMSE', ascending = True)

Unnamed: 0,Model,R^2,Adjusted R^2,MAE,MSE,RMSE
0,XGBoost Regressor,0.934522,0.932963,0.0837601,0.0187566,0.136955
0,LightGBM Regressor,0.934002,0.93243,0.084159,0.0189056,0.137498


Models with full features (in Part 3 notebook) still performs better