# Modeling Kostas

## Import data

In [2]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Locate the project root. The notebook resolves everything relative to the
# parent of the current working directory (data/, models/ and src/ live there).
cwd = os.getcwd()
path = str(Path(cwd).parent.absolute())

# Make the project's src/ folder importable without a machine-specific absolute path.
sys.path.insert(1, os.path.join(path, 'src'))
from preprocessing import run_preprocessing_pipeline

# Build the CSV location with os.path.join instead of hand-written backslashes:
# the previous '\data\\listings.csv' literal relied on the invalid escape '\d'.
df_initial = pd.read_csv(os.path.join(path, 'data', 'listings.csv'))

In [3]:
# Run the project's preprocessing pipeline on the raw listings and preview the first rows.
df = run_preprocessing_pipeline(df_initial)
df.head()

Unnamed: 0,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,...,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,instant_bookable,reviews_per_month,target
0,2009,0,0.0,1,6.0,1,1,329,37.98863,23.76527,...,19,49,79,170,32,7,0,1,0.41,79.0
1,2009,0,0.0,1,6.0,1,1,329,37.98903,23.76448,...,26,56,86,361,52,12,1,1,0.72,50.0
2,2009,0,0.0,1,6.0,1,1,329,37.98888,23.76473,...,15,26,56,331,71,19,3,1,0.97,38.0
3,2009,0,0.0,1,6.0,1,1,329,37.98903,23.76448,...,22,52,82,357,24,1,0,1,0.33,48.0
4,2009,0,0.0,1,6.0,1,1,329,37.98924,23.765,...,0,27,57,208,17,0,0,1,0.23,47.0


## Split data into training and test set

In [3]:
# Split into features and target.
X = df.drop(columns = 'target')
y = df['target']

# Split into train and test set (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Remove outliers from the training set only.
# NOTE: the test set keeps targets above the threshold, so test-set errors are
# not directly comparable to cross-validation scores computed on the filtered
# training data.
price_threshold = 200

# Re-number the training rows, producing new objects instead of mutating the
# frames returned by train_test_split in place.
X_train = X_train.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)

# Vectorized filter: keep only rows whose target does not exceed the threshold.
keep = y_train <= price_threshold
X_train = X_train[keep]
y_train = y_train[keep]

## Decision Tree

In [5]:
from sklearn.tree import DecisionTreeRegressor

# Create model. A fixed random_state makes the fitted tree, and therefore every
# score printed below, reproducible between runs.
model_dt = DecisionTreeRegressor(criterion = 'absolute_error', random_state = 42)

# 7-fold cross-validation on the training data, once per metric.
scores = cross_val_score(model_dt, X_train, y_train, scoring = 'neg_mean_absolute_error', cv = 7)
print(f'Average Cross-Validation score MAE: {-1 * scores.mean()}')

scores = cross_val_score(model_dt, X_train, y_train, scoring = 'neg_mean_absolute_percentage_error', cv = 7)
print(f'Average Cross-Validation score MAPE: {-1 * scores.mean()}')

# Fit on the full (outlier-filtered) training set.
model_dt.fit(X_train, y_train)

# Get predictions on test set.
y_pred_dt = model_dt.predict(X_test)

# Calculate MAE and MAPE.
mae_dt = mean_absolute_error(y_test, y_pred_dt)
mape_dt = mean_absolute_percentage_error(y_test, y_pred_dt)

print(f'MAE: {mae_dt : .2f}')
print(f'MAPE: {mape_dt : .2f}')

Average Cross-Validation score MAE: 24.04077027873194
Average Cross-Validation score MAPE: 0.432500154297251
MAE:  38.19
MAPE:  0.47


## Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares baseline.
model_lr = LinearRegression()

# 7-fold cross-validation on the training data, once per metric.
for metric_label, scorer in (('MAE', 'neg_mean_absolute_error'),
                             ('MAPE', 'neg_mean_absolute_percentage_error')):
    scores = cross_val_score(model_lr, X_train, y_train, scoring = scorer, cv = 7)
    print(f'Average Cross-Validation score {metric_label}: {-scores.mean()}')

# Fit on the full training set, then predict the held-out test set.
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

# Test-set MAE and MAPE.
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mape_lr = mean_absolute_percentage_error(y_test, y_pred_lr)

print(f'MAE: {mae_lr : .2f}')
print(f'MAPE: {mape_lr : .2f}')

Average Cross-Validation score MAE: 27.532593632963493
Average Cross-Validation score MAPE: 0.5530331282130986
MAE:  35.32
MAPE:  0.46


## Random Forest

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Create model. A fixed random_state makes the bootstrap samples and feature
# draws, and therefore every score printed below, reproducible between runs.
model_rf = RandomForestRegressor(random_state = 42)

# 7-fold cross-validation on the training data, once per metric.
scores = cross_val_score(model_rf, X_train, y_train, scoring = 'neg_mean_absolute_error', cv = 7)
print(f'Average Cross-Validation score: {-1 * scores.mean()}')

scores = cross_val_score(model_rf, X_train, y_train, scoring = 'neg_mean_absolute_percentage_error', cv = 7)
print(f'Average Cross-Validation score MAPE: {-1 * scores.mean()}')

# Fit on the full (outlier-filtered) training set.
model_rf.fit(X_train, y_train)

# Get predictions on test set.
y_pred_rf = model_rf.predict(X_test)

# Calculate MAE and MAPE.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mape_rf = mean_absolute_percentage_error(y_test, y_pred_rf)

print(f'MAE: {mae_rf : .2f}')
print(f'MAPE: {mape_rf : .2f}')

Average Cross-Validation score: 17.203669839879115
Average Cross-Validation score MAPE: 0.3281736493994804
MAE:  31.55
MAPE:  0.36


## XGBoost

In [7]:
from xgboost import XGBRegressor

# XGBoost regressor with library-default hyper-parameters.
model_xgb = XGBRegressor()

# 7-fold cross-validation on the training data, once per metric.
for prefix, scorer in (('Average Cross-Validation score', 'neg_mean_absolute_error'),
                       ('Average Cross-Validation score MAPE', 'neg_mean_absolute_percentage_error')):
    scores = cross_val_score(model_xgb, X_train, y_train, scoring = scorer, cv = 7)
    print(f'{prefix}: {-scores.mean()}')

# Fit on the full training set, then predict the held-out test set.
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

# Test-set MAE and MAPE.
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mape_xgb = mean_absolute_percentage_error(y_test, y_pred_xgb)

print(f'MAE: {mae_xgb : .2f}')
print(f'MAPE: {mape_xgb : .2f}')

Average Cross-Validation score: 17.264341968539718
Average Cross-Validation score MAPE: 0.32658132569308357
MAE:  30.83
MAPE:  0.34


### Parameter Tuning

Use Grid Search to tune the model

1. Find optimal n_estimators.

In [17]:
# Step 1: search the number of boosting rounds with every other parameter held fixed.
model_xgb_tuned = XGBRegressor()

param_grid = {
    'n_estimators' : range(100, 650, 50),
    'learning_rate' : [0.1],
    'max_depth' : [5],
    'min_child_weight' : [1],
    'gamma' : [0],
    'subsample' : [0.8],
    'colsample_bytree' : [0.8],
    'n_jobs' : [4],
    'scale_pos_weight' : [1],
}

# n_jobs = -1 evaluates the candidates in parallel, matching the later tuning
# steps; it affects run time only, not the scores.
grid = GridSearchCV(model_xgb_tuned, param_grid, cv = 5, scoring = 'neg_mean_absolute_error', n_jobs = -1)

grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    val

In [19]:
# Report the winning candidate; best_score_ is a negated MAE, so flip its sign.
print(f"Best number of estimators: {grid.best_params_['n_estimators']}")
print(f'Best MAE: {-grid.best_score_}')

Best number of estimators: 300
Best MAE: 16.934506169407275


2. Find optimal max_depth and min_child_weight.

In [20]:
# Step 2: with n_estimators fixed at 300, search max_depth and min_child_weight jointly.
model_xgb_tuned = XGBRegressor()

param_grid = {
    'n_estimators' : [300],
    'learning_rate' : [0.1],
    'max_depth' : range(3, 10),
    'min_child_weight' : range(1, 6),
    'gamma' : [0],
    'subsample' : [0.8],
    'colsample_bytree' : [0.8],
    'n_jobs' : [4],
    'scale_pos_weight' : [1],
}

grid = GridSearchCV(
    estimator = model_xgb_tuned,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    cv = 5,
    n_jobs = -1,
)

grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             n_jobs=-1,
       

In [21]:
# Report the winning candidate; best_score_ is a negated MAE, so flip its sign.
best = grid.best_params_
print(f"Best max depth: {best['max_depth']}")
print(f"Best min child weight: {best['min_child_weight']}")
print(f'Best MAE: {-grid.best_score_}')

Best max depth: 7
Best min child weight: 2
Best MAE: 16.7415625575158


3. Find optimal gamma.

In [24]:
# Step 3: with depth 7 and min_child_weight 2 fixed, search gamma over 0.0 .. 0.5.
model_xgb_tuned = XGBRegressor()

param_grid = {
    'n_estimators' : [300],
    'learning_rate' : [0.1],
    'max_depth' : [7],
    'min_child_weight' : [2],
    'gamma' : [i / 10 for i in range(6)],
    'subsample' : [0.8],
    'colsample_bytree' : [0.8],
    'n_jobs' : [4],
    'scale_pos_weight' : [1],
}

grid = GridSearchCV(
    estimator = model_xgb_tuned,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    cv = 5,
    n_jobs = -1,
)

grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             n_jobs=-1,
       

In [25]:
# Report the winning candidate; best_score_ is a negated MAE, so flip its sign.
print(f"Best gamma: {grid.best_params_['gamma']}")
print(f'Best MAE: {-grid.best_score_}')

Best gamma: 0.5
Best MAE: 16.73508294171928


4. Recalibrate optimal n_estimators.

In [28]:
# Step 4: re-search the number of boosting rounds now that depth, child weight and gamma are fixed.
model_xgb_tuned = XGBRegressor()

param_grid = {
    'n_estimators' : range(100, 650, 50),
    'learning_rate' : [0.1],
    'max_depth' : [7],
    'min_child_weight' : [2],
    'gamma' : [0.5],
    'subsample' : [0.8],
    'colsample_bytree' : [0.8],
    'n_jobs' : [4],
    'scale_pos_weight' : [1],
}

grid = GridSearchCV(
    estimator = model_xgb_tuned,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    cv = 5,
    n_jobs = -1,
)

grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             n_jobs=-1,
       

In [29]:
# Report the winning candidate; best_score_ is a negated MAE, so flip its sign.
print(f"Best number of estimators: {grid.best_params_['n_estimators']}")
print(f'Best MAE: {-grid.best_score_}')

Best number of estimators: 200
Best MAE: 16.68390879699337


5. Find optimal subsample and colsample_bytree.

In [30]:
# Step 5a: coarse search of row and column subsampling ratios (0.6 .. 0.9 in steps of 0.1).
model_xgb_tuned = XGBRegressor()

param_grid = {
    'n_estimators' : [200],
    'learning_rate' : [0.1],
    'max_depth' : [7],
    'min_child_weight' : [2],
    'gamma' : [0.5],
    'subsample' : [i / 10 for i in range(6, 10)],
    'colsample_bytree' : [i / 10 for i in range(6, 10)],
    'n_jobs' : [4],
    'scale_pos_weight' : [1],
}

grid = GridSearchCV(
    estimator = model_xgb_tuned,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    cv = 5,
    n_jobs = -1,
)

grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             n_jobs=-1,
       

In [31]:
# Report the winning candidate; best_score_ is a negated MAE, so flip its sign.
best = grid.best_params_
print(f"Best subsample: {best['subsample']}")
print(f"Best colsample bytree: {best['colsample_bytree']}")
print(f'Best MAE: {-grid.best_score_}')

Best subsample: 0.8
Best colsample bytree: 0.8
Best MAE: 16.68390879699337


In [32]:
# Step 5b: finer search of the subsampling ratios around 0.8 (0.75, 0.80, 0.85).
model_xgb_tuned = XGBRegressor()

param_grid = {
    'n_estimators' : [200],
    'learning_rate' : [0.1],
    'max_depth' : [7],
    'min_child_weight' : [2],
    'gamma' : [0.5],
    'subsample' : [i / 100 for i in range(75, 90, 5)],
    'colsample_bytree' : [i / 100 for i in range(75, 90, 5)],
    'n_jobs' : [4],
    'scale_pos_weight' : [1],
}

grid = GridSearchCV(
    estimator = model_xgb_tuned,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    cv = 5,
    n_jobs = -1,
)

grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             n_jobs=-1,
       

In [33]:
# Report the winning candidate; best_score_ is a negated MAE, so flip its sign.
best = grid.best_params_
print(f"Best subsample: {best['subsample']}")
print(f"Best colsample bytree: {best['colsample_bytree']}")
print(f'Best MAE: {-grid.best_score_}')

Best subsample: 0.8
Best colsample bytree: 0.8
Best MAE: 16.68390879699337


6. Find optimal reg alpha.

In [34]:
# Step 6: search the L1 regularisation strength (reg_alpha) over several orders of magnitude.
model_xgb_tuned = XGBRegressor()

param_grid = {
    'n_estimators' : [200],
    'learning_rate' : [0.1],
    'max_depth' : [7],
    'min_child_weight' : [2],
    'gamma' : [0.5],
    'subsample' : [0.8],
    'colsample_bytree' : [0.8],
    'n_jobs' : [4],
    'scale_pos_weight' : [1],
    'reg_alpha' : [1e-5, 1e-2, 0.1, 1, 100],
}

grid = GridSearchCV(
    estimator = model_xgb_tuned,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    cv = 5,
    n_jobs = -1,
)

grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             n_jobs=-1,
       

In [35]:
# Report the winning candidate; best_score_ is a negated MAE, so flip its sign.
print(f"Best reg alpha: {grid.best_params_['reg_alpha']}")
print(f'Best MAE: {-grid.best_score_}')

Best reg alpha: 1e-05
Best MAE: 16.68390880121172


7. Finally find optimal learning rate and number of estimators.

In [37]:
# Step 7a: joint search of learning rate (0.01 .. 0.10) and boosting rounds (100 .. 1000).
model_xgb_tuned = XGBRegressor()

param_grid = {
    'n_estimators' : range(100, 1100, 100),
    'learning_rate' : [i / 100 for i in range(1, 11)],
    'max_depth' : [7],
    'min_child_weight' : [2],
    'gamma' : [0.5],
    'subsample' : [0.8],
    'colsample_bytree' : [0.8],
    'n_jobs' : [4],
    'scale_pos_weight' : [1],
    'reg_alpha' : [1e-5],
}

grid = GridSearchCV(
    estimator = model_xgb_tuned,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    cv = 5,
    n_jobs = -1,
)

grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    validate_parameters=None, verbosity=None),
             n_jobs=-1,
             param_grid={'colsample_bytree': [0.8], 'gamma': [0.5],
                         'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06,
      

In [38]:
# Report the winning candidate; best_score_ is a negated MAE, so flip its sign.
best = grid.best_params_
print(f"Best number of estimators: {best['n_estimators']}")
print(f"Best learning rate: {best['learning_rate']}")
print(f'Best MAE: {-grid.best_score_}')

Best number of estimators: 900
Best learning rate: 0.02
Best MAE: 16.45838253881044


In [39]:
# Step 7b: finer search around the previous optimum (900 rounds, learning rate 0.02).
model_xgb_tuned = XGBRegressor()

param_grid = {
    'n_estimators' : [850, 900, 950],
    'learning_rate' : [0.015, 0.02, 0.025],
    'max_depth' : [7],
    'min_child_weight' : [2],
    'gamma' : [0.5],
    'subsample' : [0.8],
    'colsample_bytree' : [0.8],
    'n_jobs' : [4],
    'scale_pos_weight' : [1],
    'reg_alpha' : [1e-5],
}

grid = GridSearchCV(
    estimator = model_xgb_tuned,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    cv = 5,
    n_jobs = -1,
)

grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             n_jobs=-1,
       

In [40]:
# Report the winning candidate; best_score_ is a negated MAE, so flip its sign.
best = grid.best_params_
print(f"Best number of estimators: {best['n_estimators']}")
print(f"Best learning rate: {best['learning_rate']}")
print(f'Best MAE: {-grid.best_score_}')

Best number of estimators: 950
Best learning rate: 0.025
Best MAE: 16.44011490414669


### Final optimized model

In [41]:
# Display the estimator refitted with the best parameters of the most recent grid search.
grid.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0.5, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.025, max_delta_step=0,
             max_depth=7, min_child_weight=2, missing=nan,
             monotone_constraints='()', n_estimators=950, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=1e-05, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [42]:
# best_score_ is a negated MAE, so flip its sign before reporting.
final_cv_mae = -grid.best_score_
print(f'Final MAE: {final_cv_mae}')

Final MAE: 16.44011490414669


### Final metrics

In [8]:
from sklearn.model_selection import cross_validate

# Create the final model with the hyper-parameters selected by the grid searches,
# spelled out explicitly so this cell does not need the searches to be re-run.
model_xgb_final = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                               colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
                               gamma=0.5, gpu_id=-1, importance_type=None,
                               interaction_constraints='', learning_rate=0.025, max_delta_step=0,
                               max_depth=7, min_child_weight=2, monotone_constraints='()', 
                               n_estimators=950, n_jobs=4, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=1e-05, reg_lambda=1, scale_pos_weight=1, 
                               subsample=0.8, tree_method='exact', validate_parameters=1, verbosity=None)

# Score all three metrics in a single 10-fold pass: each fold's model is fitted
# once and scored three times, instead of running cross-validation three times
# (30 fits) on the same folds.
cv_results = cross_validate(model_xgb_final, X_train, y_train, cv = 10,
                            scoring = ['neg_mean_absolute_error',
                                       'neg_median_absolute_error',
                                       'neg_mean_absolute_percentage_error'])

# Mean Absolute Error
scores = cv_results['test_neg_mean_absolute_error']

print(f'Final Mean Absolute Error: {-1 * scores.mean()}')

# Median Absolute Error
scores = cv_results['test_neg_median_absolute_error']

print(f'Final Median Absolute Error: {-1 * scores.mean()}')

# Mean Absolute Percentage Error
scores = cv_results['test_neg_mean_absolute_percentage_error']

print(f'Final MAPE: {-1 * scores.mean()}')

Final Mean Absolute Error: 16.00458345245672
Final Median Absolute Error: 10.792226791381836
Final MAPE: 0.29479677199929366


### Apply on test set

In [9]:
# Train the final model on the outlier-filtered training set.
model_xgb_final.fit(X_train, y_train)

# Predict the held-out test set and compute the three error metrics.
y_pred_xgb = model_xgb_final.predict(X_test)
mae_xgb, medae_xgb, mape_xgb = (
    metric(y_test, y_pred_xgb)
    for metric in (mean_absolute_error, median_absolute_error, mean_absolute_percentage_error)
)

print('Metrics from applying model on test set:')
for metric_name, metric_value in (('Mean Absolute Error', mae_xgb),
                                  ('Median Absolute Error', medae_xgb),
                                  ('Mean Absolute Percentage Error', mape_xgb)):
    print(f'{metric_name}: {metric_value : .2f}')

Metrics from applying model on test set:
Mean Absolute Error:  29.89
Median Absolute Error:  11.34
Mean Absolute Percentage Error:  0.32


### Save model

In [9]:
from joblib import dump

# Persist the fitted model under <project root>/models. os.path.join replaces the
# hand-written '\models\\model.joblib' suffix, which relied on the invalid escape '\m'.
dump(model_xgb_final, os.path.join(path, 'models', 'model.joblib'))

['c:\\Users\\Administrator\\Documents\\Python Code\\Big Data & AI Academy\\Live Training\\Exercises\\Project\\models\\model.joblib']

# Modeling Dimitris
This script is used to perform grid search to find the optimal values for the hyper-parameters of 4 regression algorithm models.
The models tried and their tuned hyperparameters are:
1. Linear Regression |  ---
2. Decision Tree &emsp;&nbsp;&nbsp;&nbsp;| max_depth:[5, 10, 15, 20, 40], min_samples_leaf:[1, 2, 4, 8]
3. Random Forest &emsp;| max_depth:[5, 10, 15, 20, 40], n_estimators:[100, 200, 400, 800]
4. XGBoost &emsp;&emsp;&emsp;&emsp;| max_depth:[5, 7, 8, 10, 15], n_estimators:[100, 200, 400, 800], learning_rate:[0.01, 0.05, 0.1, .2, .4, .8], colsample_bytree:[.5, .6, .8]

In [5]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Re-run the preprocessing pipeline on the raw listings. The result is bound to a
# single name and used below as a DataFrame that contains a `target` column.
# NOTE(review): the earlier comment described a (dataset, NaN-flag) return value,
# which does not match this single-name assignment - confirm against the pipeline.
df = run_preprocessing_pipeline(df_initial)

In [None]:
def employ_grid_search(df, model_arg, dict_params, price_threshold=200, cv=10):
    """Run a cross-validated grid search for one model on the preprocessed data.

    The data is split 70/30 into train and test sets with a fixed seed. Rows
    whose target exceeds ``price_threshold`` are dropped from the training set
    only, and the grid search is fitted on the remaining training rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The preprocessed dataframe; must contain a ``target`` column.
    model_arg : estimator
        The model instance to tune, e.g. ``RandomForestRegressor()``.
    dict_params : dict
        The parameters and their candidate values, in dictionary form.
    price_threshold : float, default 200
        Training rows with a target above this value are treated as outliers.
    cv : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    grid : GridSearchCV
        The fitted grid (``refit=False``) holding every candidate and its metrics.
    X_train, X_test, y_train, y_test
        The split datasets originating from ``df``; the training pair is already
        filtered, the test pair is untouched.
    """
    # Select the features by name rather than by position, so the function does
    # not silently depend on `target` being the last column.
    X = df.drop(columns="target")
    y = df["target"].copy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Removing outliers from training set only.
    keep = y_train <= price_threshold
    X_train = X_train.loc[keep]
    y_train = y_train.loc[keep]

    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single worker in that case. Two cores are left free.
    cpu_total = os.cpu_count() or 1
    if cpu_total > 2:
        n_jobs = cpu_total - 2
        print("Using",n_jobs,"threads to parallelize grid search.")
    else:
        n_jobs = 1

    # refit=False: with several scoring metrics there is no single "best" model to refit.
    grid = GridSearchCV(model_arg, dict_params, cv = cv, verbose = 4,
                        scoring = ["neg_mean_absolute_error","neg_median_absolute_error","neg_mean_absolute_percentage_error","neg_root_mean_squared_error"], refit = False,
                        n_jobs = n_jobs)
    grid.fit(X_train, y_train)

    return grid, X_train, X_test, y_train, y_test

In [None]:
#Run the grids

#Linear Regression --- No grid search is needed but included here for perspective reasons.
# The `normalize` option was removed from LinearRegression in scikit-learn 1.2,
# so a single-candidate grid on `fit_intercept` (its default value) is used instead.
# Feature scaling does not change the predictions of an unregularised
# least-squares fit, so the scores are unaffected.
lin_reg_params = {"fit_intercept":[True]}
gridLR, X_trainLR, X_testLR, y_trainLR, y_testLR = employ_grid_search(df, LinearRegression(), lin_reg_params)

#Random Forest
rf_params = {'n_estimators':[100, 200, 400, 800], 'max_depth':[5, 10, 15, 20, 40]}
gridRF, X_trainRF, X_testRF, y_trainRF, y_testRF = employ_grid_search(df, RandomForestRegressor(), rf_params)

#Decision Tree
tree_params = {'min_samples_leaf':[1, 2, 4, 8], 'max_depth':[5, 10, 15, 20, 40]}
gridDT, X_trainDT, X_testDT, y_trainDT, y_testDT = employ_grid_search(df, DecisionTreeRegressor(), tree_params)

#XGboost --- This can take some time to run due to the many parameter combinations tried.
# Comment out the two lines below to skip it (later cells that use gridXGB will then fail).
xgb_params = {'n_estimators' : [100, 200, 400, 800], 'learning_rate' : [.01, .05, .1, .2, .4, .8], 'max_depth' : [5, 7, 8, 10, 15], 'colsample_bytree' : [.5, .6, .8]}
gridXGB, X_trainXGB, X_testXGB, y_trainXGB, y_testXGB = employ_grid_search(df, XGBRegressor(), xgb_params)

In [None]:
# Identify the best model parameters based on the scores achieved during cross-validation.
# For every model, the top 5 candidates are listed twice: ranked by MAE and by RMSE.
# Each list is labelled with its model and metric so the output can be read on its own.

labelled_grids = [("Linear Regression", gridLR), ("Random Forest", gridRF),
                  ("Decision Tree", gridDT), ("XGBoost", gridXGB)]

for model_name, gs in labelled_grids:
    cv_table = pd.DataFrame(gs.cv_results_)
    for metric in ["mean_test_neg_mean_absolute_error", "mean_test_neg_root_mean_squared_error"]:
        print(f"{model_name} - top 5 by {metric}")
        # Scores are negated errors, so the largest values are the best candidates.
        top_scores = cv_table[metric].sort_values(ascending = False).head(5)
        for row_label, score in top_scores.items():
            print(f"  error = {-score:.3f}  params = {cv_table.loc[row_label, 'params']}")

In [None]:
#Training the four models with the hyperparameter values chosen from the grid
#search results, then evaluating each one on its held-out test set.

# LinearRegression no longer accepts `normalize` (removed in scikit-learn 1.2);
# scaling does not change an unregularised least-squares fit's predictions.
modelLR = LinearRegression()
# Fixed seeds make the tree and forest results reproducible between runs.
modelTree = DecisionTreeRegressor(min_samples_leaf=8, max_depth=10, random_state=42)
modelRF = RandomForestRegressor(n_estimators=400, max_depth=20, random_state=42)
modelXGB = XGBRegressor(colsample_bytree= 0.6, learning_rate= 0.05, max_depth= 8, n_estimators= 800)

modelLR.fit(X_trainLR, y_trainLR)
modelTree.fit(X_trainDT, y_trainDT)
modelRF.fit(X_trainRF, y_trainRF)
modelXGB.fit(X_trainXGB, y_trainXGB)

predsLR = modelLR.predict(X_testLR)
predsTree = modelTree.predict(X_testDT)
predsRF = modelRF.predict(X_testRF)
predsXGB = modelXGB.predict(X_testXGB)

# (title, true values, predictions) for each model, in reporting order.
evaluations = [
    ("Linear Regresion", y_testLR, predsLR),
    ("Decision Tree", y_testDT, predsTree),
    ("Random Forest", y_testRF, predsRF),
    ("XGBoost", y_testXGB, predsXGB),
]

for position, (title, y_true, preds) in enumerate(evaluations):
    if position:
        print("-------------------------------------")
    print(title)
    print("Mean abs error:",mean_absolute_error(y_true, preds))
    # RMSE as sqrt(MSE): the `squared=False` shortcut of mean_squared_error is
    # deprecated in recent scikit-learn releases, this form works in all of them.
    print("RMSE:",np.sqrt(mean_squared_error(y_true, preds)))

In [None]:
#Visualize a part of the predictions against their true values

# Positional window of test samples to display. The previous xlim((100,15)) had
# its bounds reversed, which mirrored the x axis.
first, last = 15, 100
positions = range(first, last)

fig, ax = plt.subplots()
# Plot the predictions themselves (not prediction minus truth) so both series
# share the same scale and fit inside the 0-200 y-range.
ax.scatter(positions, predsXGB[first:last], marker='d', label='Predicted')
ax.scatter(positions, y_testXGB.values[first:last], label='True')
ax.set_ylim(0, 200)
ax.set_xlim(first, last)
ax.set_xlabel('Test sample (position)')
ax.set_ylabel('Target (price)')
ax.set_title('XGBoost predictions vs. true values')
ax.legend()
plt.show()

In [None]:
#Collect the positions of undervalued listings: those predicted at more than 1.3x their true value.

undervalued = [
    position
    for position, (predicted, actual) in enumerate(zip(predsXGB, y_testXGB.values))
    if predicted > actual*1.3
]

print("there are",len(undervalued),"undervalued listings on AirBnB.")