# Part 3C - Rental Price Prediction (Remove Several Amenities)
 
### Content  
- [Section 1 - Data Pre-processing](#section-1)  
- [Section 2 - Modeling - XGBoost Regressor](#section-2)  
- [Section 3 - Modeling - LightGBM Regressor](#section-3)  

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

In [11]:
# Lift pandas' display truncation for both columns and rows (None = no limit),
# so wide frames and long tables are rendered in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [12]:
import warnings
warnings.filterwarnings("ignore")

___
<a name="section-1"></a>
## (i) Data Pre-Processing

In [13]:
# Load the pre-processed condo listings; the path is relative to the notebook's working directory.
condo_df = pd.read_csv('./Data/condo_dataset_preprocessed.csv')

#### Drop unnecessary columns

In [14]:
# Inspect the available columns before choosing which ones to drop.
condo_df.columns

Index(['amenities', 'beds', 'developer', 'district', 'electoral_div', 'lease',
       'name', 'nearest_mrt_name', 'neighbourhood', 'property_type', 'rental',
       'sqft', 'tenure', 'travel_time_changi', 'travel_time_orchard',
       'travel_time_raffles', 'furnishing', 'amenities_count',
       'nearest_mrt_dist_min', 'nearest_mrt_dist_metres', 'freehold_status',
       'rental_log'],
      dtype='object')

In [15]:
# Columns excluded from the modelling frame. The raw `rental` column is among
# them; `rental_log` is kept and used as the target further down.
cols_to_drop = [
    'name', 'beds', 'developer', 'electoral_div', 'neighbourhood',
    'rental', 'nearest_mrt_name', 'property_type',
    'amenities_count', 'nearest_mrt_dist_min', 'nearest_mrt_dist_metres',
    'furnishing', 'tenure', 'freehold_status',
]

In [16]:
# Build the slimmed-down frame (a new object; `condo_df` itself is left untouched)
# and preview its first rows.
condo_df_sm = condo_df.drop(cols_to_drop, axis=1)
condo_df_sm.head()

Unnamed: 0,amenities,district,lease,sqft,travel_time_changi,travel_time_orchard,travel_time_raffles,rental_log
0,"'Clubhouse', 'Gym', 'Spa Pool', 'Swimming Pool...",D09,Flexible,2100,53,12,16,9.159047
1,"'Aircon', 'Gym', 'Swimming Pool', 'Wading Pool...",D09,Flexible,1399,56,12,13,8.853665
2,"'Bathtub', 'Aircon', 'Maid Room', 'Gym', 'Park...",D05,greater_than_or_equal_24m,1948,70,27,22,8.732305
3,"'Jogging Track', 'Multi-purpose Hall', 'Hammoc...",D22,Flexible,775,85,46,39,7.937375
4,"'Security', 'Jacuzzi', 'Lap Pool', 'Tennis Cou...",D15,Flexible,635,46,41,34,7.783224


#### Generate amenities of interest

In [17]:
# Flatten every listing's comma-separated amenities string into one long list,
# stripping the single quotes that wrap each amenity name.
list_of_amenities = [
    token.replace("'", "")
    for listing_amenities in condo_df_sm['amenities']
    for token in listing_amenities.split(', ')
]

In [18]:
# Value counts of amenities
amenity_counts = Counter(list_of_amenities)
amenities_df = (
    pd.DataFrame.from_dict(amenity_counts, orient='index')
    .reset_index()
    .rename(columns={'index': 'amenities', 0: 'count'})
)

# Denominator for the percentage: number of non-null `rental_log` values.
# Look the statistic up by its label; `describe()[0]` relied on positional
# integer indexing of a label-indexed Series, which pandas has deprecated.
n_listings = condo_df_sm['rental_log'].describe()['count']
amenities_df['percentage'] = (100 * (amenities_df['count'] / n_listings)).round(1)

# Most common amenities first.
amenities_df.sort_values(['count'], ascending=False).set_index('amenities')

Unnamed: 0_level_0,count,percentage
amenities,Unnamed: 1_level_1,Unnamed: 2_level_1
Gym,5661,77.4
Security,5419,74.1
Parking,5187,70.9
BBQ,4944,67.6
Swimming Pool,4590,62.8
Playground,4165,57.0
Jacuzzi,3663,50.1
Aircon,3580,49.0
Clubhouse,3483,47.6
Wading Pool,3396,46.4


In [19]:
# Identify the amenities that will likely impact rental price (based on own assumptions and business understanding)
# Remove Balcony, Sauna, Steam Room
# Include High Ceiling
# NOTE(review): the note above says Sauna and Steam Room are removed, but both are
# still present in the list below (only Balcony is absent) - confirm which is intended.
key_amenities = ['High Floor','Renovated','City View','Greenery View', 'High Ceiling', 'Sauna', 'Steam Room']

In [20]:
# Create one-hot encoded column for each amenity
# NOTE(review): this copy is taken *before* the indicator columns are added and is
# not referenced again in the visible cells; the indicators are written to
# `condo_df_sm`, which is what the following cells consume.
condo_df_with_amenities = condo_df_sm.copy()

for amenity in key_amenities:
    # Column name: lower-case amenity with spaces turned into underscores.
    amenity_name = amenity.lower().replace(' ', '_')
    # regex=False matches the amenity name as a literal substring (no regex
    # metacharacter surprises); na=False maps a missing amenities string to 0
    # instead of propagating NaN. The boolean mask is stored as 0/1 integers.
    condo_df_sm[amenity_name] = (
        condo_df_sm['amenities']
        .str.contains(amenity, regex=False, na=False)
        .astype('int64')
    )

In [21]:
# Drop original amenities column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
condo_df_sm = condo_df_sm.drop(columns=['amenities'], errors='ignore')

___
<a name="section-2"></a>
## (ii) Modeling - XGBoost Regressor

In [22]:
# Work on a copy of the engineered frame.
# NOTE(review): this rebinds `condo_df`, which earlier held the raw loaded data.
condo_df = condo_df_sm.copy()

# One-hot encoding of categorical variables (all levels kept), district first, then lease
for categorical_col in ('district', 'lease'):
    condo_df = pd.get_dummies(condo_df, columns=[categorical_col], drop_first=False)

# Target is the log rental; every remaining column is a predictor
y = condo_df['rental_log']
X = condo_df.drop(columns=['rental_log'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Hyper-parameter grid for the XGBoost search: 5 x 3 x 4 x 5 x 1 = 300 combinations.
xgb_param_grid = dict(
    colsample_bytree=[0.4, 0.5, 0.6, 0.7, 0.8],
    learning_rate=[0.01, 0.1, 0.3],
    max_depth=[3, 4, 5, 6],
    alpha=[1e-5, 1e-2, 0.1, 1, 100],
    n_estimators=[200],
)

In [24]:
# Base estimator: squared-error objective, fixed seed
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', seed=42)

# Exhaustive search over `xgb_param_grid` with 5-fold cross-validation on all cores
xgb_reg_grid = GridSearchCV(
    estimator=xgb_reg,
    param_grid=xgb_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

xgb_reg_grid.fit(X_train, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  5.6min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, seed=42,
                                    subsample=None, tree_method=None,
      

In [25]:
# Parameter combination selected by the grid search
xgb_reg_grid.best_params_

{'alpha': 0.1,
 'colsample_bytree': 0.8,
 'learning_rate': 0.3,
 'max_depth': 6,
 'n_estimators': 200}

In [26]:
# Take the best estimator found by the search and predict on the held-out test set.
# NOTE(review): `y_pred_test` is reused for the LightGBM predictions later, so the
# metrics cell below must run before that section overwrites it.
best_xgb_reg = xgb_reg_grid.best_estimator_
y_pred_test = best_xgb_reg.predict(X_test)

In [27]:
# Store results as dataframe
# Each test-set metric is computed once and reused (R^2 feeds adjusted R^2, MSE feeds RMSE).
xgb_r2 = metrics.r2_score(y_test, y_pred_test)
xgb_mse = metrics.mean_squared_error(y_test, y_pred_test)
n_test_obs = len(y_test)
n_predictors = X_test.shape[1]

xgb_dict = {'Model': 'XGBoost Regressor',
            'R^2': xgb_r2,
            # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), using the test-set size n
            'Adjusted R^2': 1 - (1 - xgb_r2) * (n_test_obs - 1) / (n_test_obs - n_predictors - 1),
            'MAE': metrics.mean_absolute_error(y_test, y_pred_test),
            'MSE': xgb_mse,
            'RMSE': np.sqrt(xgb_mse)}

# A one-row frame built from a list of records keeps numeric dtypes; the former
# from_dict(..., orient='index').T route produced an all-object frame.
xgb_metrics = pd.DataFrame([xgb_dict])
xgb_metrics

Unnamed: 0,Model,R^2,Adjusted R^2,MAE,MSE,RMSE
0,XGBoost Regressor,0.937192,0.93538,0.0841628,0.0179917,0.134133


#### Feature Importance

In [28]:
# Pair every training column with its importance score from the fitted model.
# Building the frame column-wise keeps 'Importance Score' numeric; the previous
# transpose of a mixed two-row frame left it as object dtype.
xgb_features = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance Score': best_xgb_reg.feature_importances_,
})

# Top 15 features by importance
xgb_features.sort_values(by='Importance Score', ascending=False).head(15)

Unnamed: 0,Feature,Importance Score
0,sqft,0.320054
14,district_D04,0.0837194
2,travel_time_orchard,0.061716
30,district_D20,0.0457846
3,travel_time_raffles,0.043607
20,district_D10,0.0423329
12,district_D02,0.0370436
16,district_D06,0.0286189
17,district_D07,0.0279514
19,district_D09,0.0272155


___
<a name="section-3"></a>
## (iii) Modeling - LightGBM Regressor

In [29]:
# Rebuild the modelling frame from the engineered data: one-hot encode the
# categorical variables (all levels kept), district first, then lease.
# NOTE(review): this repeats the preparation done in the XGBoost section.
condo_df = (
    condo_df_sm.copy()
    .pipe(pd.get_dummies, columns=['district'], drop_first=False)
    .pipe(pd.get_dummies, columns=['lease'], drop_first=False)
)

# Target is the log rental; every remaining column is a predictor
y = condo_df['rental_log']
X = condo_df.drop(columns=['rental_log'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Hyper-parameter grid for the LightGBM search: only max_depth (7 values) and
# learning_rate (5 values) vary, giving 35 combinations.
gbm_param_grid = dict(
    boosting_type=['gbdt'],
    objective=['regression'],
    metric=['rmse'],
    max_depth=[4, 6, 8, 9, 10, 11, 12],
    learning_rate=[0.001, 0.01, 0.05, 0.1, 0.2],
)

In [31]:
# Base estimator: 1000 boosting iterations, fixed seed
light_gbm_reg = lgb.LGBMRegressor(num_iterations=1000, seed=42)

# Exhaustive search over `gbm_param_grid` with 5-fold cross-validation on all cores
light_gbm_grid = GridSearchCV(
    estimator=light_gbm_reg,
    param_grid=gbm_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

light_gbm_grid.fit(X_train, y_train)

Fitting 5 folds for each of 35 candidates, totalling 175 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   44.4s
[Parallel(n_jobs=-1)]: Done 175 out of 175 | elapsed:  2.9min finished




GridSearchCV(cv=5, estimator=LGBMRegressor(num_iterations=1000, seed=42),
             n_jobs=-1,
             param_grid={'boosting_type': ['gbdt'],
                         'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
                         'max_depth': [4, 6, 8, 9, 10, 11, 12],
                         'metric': ['rmse'], 'objective': ['regression']},
             verbose=1)

In [32]:
# Parameter combination selected by the grid search
light_gbm_grid.best_params_

{'boosting_type': 'gbdt',
 'learning_rate': 0.1,
 'max_depth': 11,
 'metric': 'rmse',
 'objective': 'regression'}

In [33]:
# Take the best estimator found by the search and predict on the held-out test set.
# NOTE(review): this overwrites the `y_pred_test` produced in the XGBoost section.
best_light_gbm = light_gbm_grid.best_estimator_
y_pred_test = best_light_gbm.predict(X_test)

In [34]:
# Store results as dataframe
# Each test-set metric is computed once and reused (R^2 feeds adjusted R^2, MSE feeds RMSE).
lgbm_r2 = metrics.r2_score(y_test, y_pred_test)
lgbm_mse = metrics.mean_squared_error(y_test, y_pred_test)
n_test_obs = len(y_test)
n_predictors = X_test.shape[1]

lgbm_dict = {'Model': 'LightGBM Regressor',
             'R^2': lgbm_r2,
             # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), using the test-set size n
             'Adjusted R^2': 1 - (1 - lgbm_r2) * (n_test_obs - 1) / (n_test_obs - n_predictors - 1),
             'MAE': metrics.mean_absolute_error(y_test, y_pred_test),
             'MSE': lgbm_mse,
             'RMSE': np.sqrt(lgbm_mse)}

# A one-row frame built from a list of records keeps numeric dtypes; the former
# from_dict(..., orient='index').T route produced an all-object frame.
lgbm_metrics = pd.DataFrame([lgbm_dict])
lgbm_metrics

Unnamed: 0,Model,R^2,Adjusted R^2,MAE,MSE,RMSE
0,LightGBM Regressor,0.934388,0.932495,0.0860068,0.0187948,0.137094


#### LightGBM Regressor Feature Importance

In [35]:
# Pair every training column with its importance score from the fitted model.
# Building the frame column-wise keeps 'Importance Score' numeric; the previous
# transpose of a mixed two-row frame left it as object dtype.
lgb_features = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance Score': best_light_gbm.feature_importances_,
})

# Top 15 features by importance
lgb_features.sort_values(by='Importance Score', ascending=False).head(15)

Unnamed: 0,Feature,Importance Score
0,sqft,10492
1,travel_time_changi,4917
2,travel_time_orchard,4388
3,travel_time_raffles,4091
10,steam_room,549
9,sauna,499
38,lease_Flexible,408
4,high_floor,385
5,renovated,347
6,city_view,312


### Model Comparison

In [36]:
# Stack the per-model metric rows; ignore_index gives each model its own row
# label instead of the duplicated 0 / 0 index.
df_metrics = pd.concat([xgb_metrics, lgbm_metrics], ignore_index=True)

# Best model (lowest RMSE) first
df_metrics.sort_values(by='RMSE', ascending=True)

Unnamed: 0,Model,R^2,Adjusted R^2,MAE,MSE,RMSE
0,XGBoost Regressor,0.937192,0.93538,0.0841628,0.0179917,0.134133
0,LightGBM Regressor,0.934388,0.932495,0.0860068,0.0187948,0.137094


The models with the full feature set (in the Part 3 notebook) still perform better.