
# Random Forest Regression


In [80]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

## E.4(a) - Listing data without Feature Selection

In [81]:
# Load dataset (a): listing data, no feature selection applied
listings1 = pd.read_csv(filepath_or_buffer='a.csv')
listings1.head(5)

Unnamed: 0.1,Unnamed: 0,id,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,...,price,minimum_nights,maximum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code
0,0,49091,100,0,2.0,1.0,1.0,1.44255,103.7958,1,...,4.406719,180,360,1,1,0,-0.016218,0.849882,1.326759,2.181841
1,1,50646,0,0,1.0,1.0,1.0,1.33235,103.78521,2,...,4.382027,90,730,1,18,0,-0.016218,0.849882,-0.434121,-1.296388
2,2,56334,100,0,2.0,1.0,1.0,1.44246,103.79667,1,...,4.219508,6,14,1,20,0,-0.016218,0.849882,1.326759,2.181841
3,3,71609,100,0,8.0,1.0,1.0,1.34541,103.95712,6,...,5.187386,90,1125,1,20,1,1.242553,0.849882,0.446319,1.684951
4,4,71896,100,0,8.0,1.0,1.0,1.34567,103.95963,3,...,4.553877,90,1125,1,24,1,0.75841,0.849882,0.446319,1.684951


In [82]:
# Drop the unnecessary columns ('Unnamed: 0' and 'id') in a single call
listings1 = listings1.drop(columns=['Unnamed: 0', 'id'])
listings1.head(5)

Unnamed: 0,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,bathrooms_text,bedrooms,...,price,minimum_nights,maximum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code
0,100,0,2.0,1.0,1.0,1.44255,103.7958,1,1.0,1.0,...,4.406719,180,360,1,1,0,-0.016218,0.849882,1.326759,2.181841
1,0,0,1.0,1.0,1.0,1.33235,103.78521,2,1.0,1.0,...,4.382027,90,730,1,18,0,-0.016218,0.849882,-0.434121,-1.296388
2,100,0,2.0,1.0,1.0,1.44246,103.79667,1,1.0,1.0,...,4.219508,6,14,1,20,0,-0.016218,0.849882,1.326759,2.181841
3,100,0,8.0,1.0,1.0,1.34541,103.95712,6,1.0,2.0,...,5.187386,90,1125,1,20,1,1.242553,0.849882,0.446319,1.684951
4,100,0,8.0,1.0,1.0,1.34567,103.95963,3,0.5,1.0,...,4.553877,90,1125,1,24,1,0.75841,0.849882,0.446319,1.684951


In [83]:
# Target is the 'price' column; every remaining column is used as a predictor
y = listings1['price']
X = listings1.drop(columns=['price'])

In [84]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [85]:
# Hyper-parameter candidates for GridSearchCV (3 * 2 * 4 * 3 = 72 combinations)
rf_param_grid = dict(
    max_depth=[80, 90, 100],        # maximum number of levels in each decision tree
    max_features=[2, 3],            # features considered when splitting a node
    min_samples_leaf=[1, 3, 4, 5],  # minimum number of data points allowed in a leaf
    n_estimators=[100, 300, 600],   # number of trees in the forest
)

In [86]:
# Base random forest regressor handed to the grid search (seeded with 42)
rf_reg = RandomForestRegressor(
    random_state=42,
)

In [87]:
# Grid search over rf_param_grid: 5-fold CV, n_jobs=-1 for parallel fits, verbose progress log
rf_grid = GridSearchCV(
    estimator=rf_reg,
    param_grid=rf_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [88]:
# Run the grid search on the training split only (the test split stays untouched)
rf_grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  3.7min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [80, 90, 100], 'max_features': [2, 3],
                         'min_samples_leaf': [1, 3, 4, 5],
                         'n_estimators': [100, 300, 600]},
             verbose=2)

In [89]:
# Pick the estimator with the best grid-search parameters, then predict the held-out test rows
rf_best = rf_grid.best_estimator_
y_pred = rf_best.predict(X=X_test)

In [90]:
# 5-fold cross-validation scores of the selected model on the training split
scores = cross_val_score(estimator=rf_best, X=X_train, y=y_train, cv=5)

In [91]:
# Gather the test-set metrics and the mean cross-validation score for model (a)
rf_dict1 = {
    'Model': 'Random Forest Regressor (a)',
    'R^2': metrics.r2_score(y_test, y_pred),
    'MAE': metrics.mean_absolute_error(y_test, y_pred),
    'MSE': metrics.mean_squared_error(y_test, y_pred),
}
rf_dict1['RMSE'] = np.sqrt(rf_dict1['MSE'])  # square root of the MSE stored just above
rf_dict1['CVS'] = scores.mean()


In [131]:
# Render the metrics dictionary as a one-row table (one column per metric)
rf_reg_metrics1 = pd.DataFrame.from_dict(rf_dict1, orient='index').transpose()
rf_reg_metrics1

Unnamed: 0,Model,R^2,MAE,MSE,RMSE,CVS
0,Random Forest Regressor (a),0.770306,0.246739,0.156828,0.396016,0.741533


## E.4(b) - Listing data with Feature Selection

In [93]:
# Load dataset (b): listing data with feature selection applied
listings2 = pd.read_csv(filepath_or_buffer='b.csv')
listings2.head(5)

Unnamed: 0.1,Unnamed: 0,price,host_response_rate,host_is_superhost,latitude,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code
0,0,4.406719,100,0,1.44255,1,1.0,1.0,1.0,180,1,1,0,-0.016218,0.849882,1.326759,2.181841
1,1,4.382027,0,0,1.33235,2,1.0,1.0,1.0,90,1,18,0,-0.016218,0.849882,-0.434121,-1.296388
2,2,4.219508,100,0,1.44246,1,1.0,1.0,1.0,6,1,20,0,-0.016218,0.849882,1.326759,2.181841
3,3,5.187386,100,0,1.34541,6,1.0,2.0,3.0,90,1,20,1,1.242553,0.849882,0.446319,1.684951
4,4,4.553877,100,0,1.34567,3,0.5,1.0,1.0,90,1,24,1,0.75841,0.849882,0.446319,1.684951


In [94]:
# Drop the unnecessary 'Unnamed: 0' column
listings2 = listings2.drop(labels='Unnamed: 0', axis='columns')
listings2.head(5)

Unnamed: 0,price,host_response_rate,host_is_superhost,latitude,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code
0,4.406719,100,0,1.44255,1,1.0,1.0,1.0,180,1,1,0,-0.016218,0.849882,1.326759,2.181841
1,4.382027,0,0,1.33235,2,1.0,1.0,1.0,90,1,18,0,-0.016218,0.849882,-0.434121,-1.296388
2,4.219508,100,0,1.44246,1,1.0,1.0,1.0,6,1,20,0,-0.016218,0.849882,1.326759,2.181841
3,5.187386,100,0,1.34541,6,1.0,2.0,3.0,90,1,20,1,1.242553,0.849882,0.446319,1.684951
4,4.553877,100,0,1.34567,3,0.5,1.0,1.0,90,1,24,1,0.75841,0.849882,0.446319,1.684951


In [95]:
# Target is the 'price' column; every remaining column is used as a predictor
y = listings2['price']
X = listings2.drop(columns=['price'])

In [96]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [97]:
# Hyper-parameter candidates for GridSearchCV (3 * 2 * 4 * 3 = 72 combinations)
rf_param_grid = dict(
    max_depth=[80, 90, 100],        # maximum number of levels in each decision tree
    max_features=[2, 3],            # features considered when splitting a node
    min_samples_leaf=[1, 3, 4, 5],  # minimum number of data points allowed in a leaf
    n_estimators=[100, 300, 600],   # number of trees in the forest
)

In [98]:
# Base random forest regressor handed to the grid search (seeded with 42)
rf_reg = RandomForestRegressor(
    random_state=42,
)

In [99]:
# Grid search over rf_param_grid: 5-fold CV, n_jobs=-1 for parallel fits, verbose progress log
rf_grid = GridSearchCV(
    estimator=rf_reg,
    param_grid=rf_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [100]:
# Run the grid search on the training split only (the test split stays untouched)
rf_grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  3.5min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [80, 90, 100], 'max_features': [2, 3],
                         'min_samples_leaf': [1, 3, 4, 5],
                         'n_estimators': [100, 300, 600]},
             verbose=2)

In [101]:
# Pick the estimator with the best grid-search parameters, then predict the held-out test rows
rf_best = rf_grid.best_estimator_
y_pred = rf_best.predict(X=X_test)

In [102]:
# 5-fold cross-validation scores of the selected model on the training split
scores = cross_val_score(estimator=rf_best, X=X_train, y=y_train, cv=5)

In [103]:
# Gather the test-set metrics and the mean cross-validation score for model (b)
rf_dict2 = {
    'Model': 'Random Forest Regressor (b)',
    'R^2': metrics.r2_score(y_test, y_pred),
    'MAE': metrics.mean_absolute_error(y_test, y_pred),
    'MSE': metrics.mean_squared_error(y_test, y_pred),
}
rf_dict2['RMSE'] = np.sqrt(rf_dict2['MSE'])  # square root of the MSE stored just above
rf_dict2['CVS'] = scores.mean()


In [134]:
# Render the metrics dictionary as a one-row table (one column per metric)
rf_reg_metrics2 = pd.DataFrame.from_dict(rf_dict2, orient='index').transpose()
rf_reg_metrics2

Unnamed: 0,Model,R^2,MAE,MSE,RMSE,CVS
0,Random Forest Regressor (b),0.758289,0.255308,0.165034,0.406243,0.736499



## E.4(c) - Listing & Review data without Feature Selection


In [105]:
# Load dataset (c): listing & review data, no feature selection applied
listings3 = pd.read_csv(filepath_or_buffer='c.csv')
listings3.head(5)

Unnamed: 0.1,Unnamed: 0,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,bathrooms_text,...,minimum_nights,maximum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code,avgcomp
0,0,100.0,0.0,2.0,1.0,1.0,1.44255,103.7958,1.0,1.0,...,180.0,360.0,1.0,1.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.9615
1,1,0.0,0.0,1.0,1.0,1.0,1.33235,103.78521,2.0,1.0,...,90.0,730.0,1.0,18.0,0.0,-0.016218,0.849882,-0.434121,-1.296388,0.825117
2,2,100.0,0.0,2.0,1.0,1.0,1.44246,103.79667,1.0,1.0,...,6.0,14.0,1.0,20.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.89919
3,3,100.0,0.0,8.0,1.0,1.0,1.34541,103.95712,6.0,1.0,...,90.0,1125.0,1.0,20.0,1.0,1.242553,0.849882,0.446319,1.684951,0.577285
4,4,100.0,0.0,8.0,1.0,1.0,1.34567,103.95963,3.0,0.5,...,90.0,1125.0,1.0,24.0,1.0,0.75841,0.849882,0.446319,1.684951,0.7472


In [106]:
# Drop the unnecessary 'Unnamed: 0' column
listings3 = listings3.drop(labels='Unnamed: 0', axis='columns')
listings3.head(5)

Unnamed: 0,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,bathrooms_text,bedrooms,...,minimum_nights,maximum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code,avgcomp
0,100.0,0.0,2.0,1.0,1.0,1.44255,103.7958,1.0,1.0,1.0,...,180.0,360.0,1.0,1.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.9615
1,0.0,0.0,1.0,1.0,1.0,1.33235,103.78521,2.0,1.0,1.0,...,90.0,730.0,1.0,18.0,0.0,-0.016218,0.849882,-0.434121,-1.296388,0.825117
2,100.0,0.0,2.0,1.0,1.0,1.44246,103.79667,1.0,1.0,1.0,...,6.0,14.0,1.0,20.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.89919
3,100.0,0.0,8.0,1.0,1.0,1.34541,103.95712,6.0,1.0,2.0,...,90.0,1125.0,1.0,20.0,1.0,1.242553,0.849882,0.446319,1.684951,0.577285
4,100.0,0.0,8.0,1.0,1.0,1.34567,103.95963,3.0,0.5,1.0,...,90.0,1125.0,1.0,24.0,1.0,0.75841,0.849882,0.446319,1.684951,0.7472


In [107]:
# Target is the 'price' column; every remaining column is used as a predictor
y = listings3['price']
X = listings3.drop(columns=['price'])

In [108]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [109]:
# Hyper-parameter candidates for GridSearchCV (3 * 2 * 4 * 3 = 72 combinations)
rf_param_grid = dict(
    max_depth=[80, 90, 100],        # maximum number of levels in each decision tree
    max_features=[2, 3],            # features considered when splitting a node
    min_samples_leaf=[1, 3, 4, 5],  # minimum number of data points allowed in a leaf
    n_estimators=[100, 300, 600],   # number of trees in the forest
)

In [110]:
# Base random forest regressor handed to the grid search (seeded with 42)
rf_reg = RandomForestRegressor(
    random_state=42,
)

In [111]:
# Grid search over rf_param_grid: 5-fold CV, n_jobs=-1 for parallel fits, verbose progress log
rf_grid = GridSearchCV(
    estimator=rf_reg,
    param_grid=rf_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [112]:
# Run the grid search on the training split only (the test split stays untouched)
rf_grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   59.9s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  2.4min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [80, 90, 100], 'max_features': [2, 3],
                         'min_samples_leaf': [1, 3, 4, 5],
                         'n_estimators': [100, 300, 600]},
             verbose=2)

In [113]:
# Pick the estimator with the best grid-search parameters, then predict the held-out test rows
rf_best = rf_grid.best_estimator_
y_pred = rf_best.predict(X=X_test)

In [114]:
# 5-fold cross-validation scores of the selected model on the training split
scores = cross_val_score(estimator=rf_best, X=X_train, y=y_train, cv=5)

In [115]:
# Gather the test-set metrics and the mean cross-validation score for model (c)
rf_dict3 = {
    'Model': 'Random Forest Regressor (c)',
    'R^2': metrics.r2_score(y_test, y_pred),
    'MAE': metrics.mean_absolute_error(y_test, y_pred),
    'MSE': metrics.mean_squared_error(y_test, y_pred),
}
rf_dict3['RMSE'] = np.sqrt(rf_dict3['MSE'])  # square root of the MSE stored just above
rf_dict3['CVS'] = scores.mean()



In [137]:
# Render the metrics dictionary as a one-row table (one column per metric)
rf_reg_metrics3 = pd.DataFrame.from_dict(rf_dict3, orient='index').transpose()
rf_reg_metrics3

Unnamed: 0,Model,R^2,MAE,MSE,RMSE,CVS
0,Random Forest Regressor (c),0.758659,0.252453,0.16599,0.407419,0.75051



## E.4(d) - Listing & Review data with Feature Selection


In [117]:
# Load dataset (d): listing & review data with feature selection applied
listings4 = pd.read_csv(filepath_or_buffer='d.csv')
listings4.head(5)

Unnamed: 0.1,Unnamed: 0,price,host_response_rate,host_is_superhost,host_total_listings_count,latitude,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code,avgcomp
0,0,4.406719,100.0,0.0,2.0,1.44255,1.0,1.0,1.0,1.0,180.0,1.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.9615
1,1,4.382027,0.0,0.0,1.0,1.33235,2.0,1.0,1.0,1.0,90.0,18.0,0.0,-0.016218,0.849882,-0.434121,-1.296388,0.825117
2,2,4.219508,100.0,0.0,2.0,1.44246,1.0,1.0,1.0,1.0,6.0,20.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.89919
3,3,5.187386,100.0,0.0,8.0,1.34541,6.0,1.0,2.0,3.0,90.0,20.0,1.0,1.242553,0.849882,0.446319,1.684951,0.577285
4,4,4.553877,100.0,0.0,8.0,1.34567,3.0,0.5,1.0,1.0,90.0,24.0,1.0,0.75841,0.849882,0.446319,1.684951,0.7472


In [118]:
# Drop the unnecessary 'Unnamed: 0' column
listings4 = listings4.drop(labels='Unnamed: 0', axis='columns')
listings4.head(5)

Unnamed: 0,price,host_response_rate,host_is_superhost,host_total_listings_count,latitude,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code,avgcomp
0,4.406719,100.0,0.0,2.0,1.44255,1.0,1.0,1.0,1.0,180.0,1.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.9615
1,4.382027,0.0,0.0,1.0,1.33235,2.0,1.0,1.0,1.0,90.0,18.0,0.0,-0.016218,0.849882,-0.434121,-1.296388,0.825117
2,4.219508,100.0,0.0,2.0,1.44246,1.0,1.0,1.0,1.0,6.0,20.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.89919
3,5.187386,100.0,0.0,8.0,1.34541,6.0,1.0,2.0,3.0,90.0,20.0,1.0,1.242553,0.849882,0.446319,1.684951,0.577285
4,4.553877,100.0,0.0,8.0,1.34567,3.0,0.5,1.0,1.0,90.0,24.0,1.0,0.75841,0.849882,0.446319,1.684951,0.7472


In [119]:
# Target is the 'price' column; every remaining column is used as a predictor
y = listings4['price']
X = listings4.drop(columns=['price'])

In [120]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [121]:
# Hyper-parameter candidates for GridSearchCV (3 * 2 * 4 * 3 = 72 combinations)
rf_param_grid = dict(
    max_depth=[80, 90, 100],        # maximum number of levels in each decision tree
    max_features=[2, 3],            # features considered when splitting a node
    min_samples_leaf=[1, 3, 4, 5],  # minimum number of data points allowed in a leaf
    n_estimators=[100, 300, 600],   # number of trees in the forest
)

In [122]:
# Base random forest regressor handed to the grid search (seeded with 42)
rf_reg = RandomForestRegressor(
    random_state=42,
)

In [123]:
# Grid search over rf_param_grid: 5-fold CV, n_jobs=-1 for parallel fits, verbose progress log
rf_grid = GridSearchCV(
    estimator=rf_reg,
    param_grid=rf_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [124]:
# Run the grid search on the training split only (the test split stays untouched)
rf_grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  2.4min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [80, 90, 100], 'max_features': [2, 3],
                         'min_samples_leaf': [1, 3, 4, 5],
                         'n_estimators': [100, 300, 600]},
             verbose=2)

In [125]:
# Pick the estimator with the best grid-search parameters, then predict the held-out test rows
rf_best = rf_grid.best_estimator_
y_pred = rf_best.predict(X=X_test)

In [126]:
# 5-fold cross-validation scores of the selected model on the training split
scores = cross_val_score(estimator=rf_best, X=X_train, y=y_train, cv=5)

In [127]:
# Gather the test-set metrics and the mean cross-validation score for model (d).
# Every metric is computed from y_pred (the test-set predictions of rf_best) so that
# the whole row describes the same model; MSE/RMSE previously referenced the name
# y_pred_test, which differs from the y_pred used for R^2 and MAE.
rf_dict4 = {
    'Model': 'Random Forest Regressor (d)',
    'R^2': metrics.r2_score(y_test, y_pred),
    'MAE': metrics.mean_absolute_error(y_test, y_pred),
    'MSE': metrics.mean_squared_error(y_test, y_pred),
}
rf_dict4['RMSE'] = np.sqrt(rf_dict4['MSE'])  # square root of the MSE stored just above
rf_dict4['CVS'] = scores.mean()


In [140]:
# Render the metrics dictionary as a one-row table (one column per metric)
rf_reg_metrics4 = pd.DataFrame.from_dict(rf_dict4, orient='index').transpose()
rf_reg_metrics4

Unnamed: 0,Model,R^2,MAE,MSE,RMSE,CVS
0,Random Forest Regressor (d),0.758123,0.252515,0.166359,0.407871,0.751426
