In [1]:
# Load Packages
# NOTE: attach order matters. MASS is attached after tidyverse, so MASS::select
# masks dplyr::select (see the startup message); column selection below is
# therefore written with the explicit dplyr:: prefix.
library(tidyverse)  # attaches dplyr, readr, ggplot2, tibble, tidyr, purrr, ...
library(broom)      # tidy() for model coefficient tables
library(leaps)      # regsubsets() for forward/backward selection
library(MASS)       # masks dplyr::select; not referenced in the cells visible here
library(rsample)    # initial_split(), training(), testing()

# Fix the RNG seed before any random step (the train/test split further down).
set.seed(999)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘MASS’


The following object is masked from ‘package:dplyr’:

    select




In [2]:
# Load Data
# Read the listings CSV from the working directory and preview the first rows.
airbnb <- readr::read_csv(file = "airbnb.csv")
airbnb %>% head()

[1mRows: [22m[34m13058[39m [1mColumns: [22m[34m21[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (3): room_type, city, day_type
[32mdbl[39m (15): realSum, person_capacity, multi, biz, cleanliness_rating, guest_sa...
[33mlgl[39m  (3): room_shared, room_private, host_is_superhost

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,⋯,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,city,day_type
<dbl>,<chr>,<lgl>,<lgl>,<dbl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
474.3175,Entire home/apt,False,False,4,False,0,1,10,91,⋯,1.111996,0.63049095,526.4694,17.94293,915.5871,20.15489,2.17556,41.39624,Barcelona,Weekday
169.8978,Private room,False,True,2,True,1,0,10,88,⋯,1.751839,0.12401717,320.1275,10.91046,794.2774,17.48449,2.14906,41.38714,Barcelona,Weekday
161.9848,Private room,False,True,4,False,0,1,9,88,⋯,1.670493,0.08032198,344.0739,11.72659,840.6736,18.50581,2.15357,41.37859,Barcelona,Weekday
367.9568,Entire home/apt,False,False,3,False,0,1,10,91,⋯,1.475847,0.09310686,400.0574,13.6346,946.5899,20.83736,2.16839,41.3739,Barcelona,Weekday
196.8953,Private room,False,True,3,False,1,0,9,91,⋯,1.855452,0.27248605,346.0422,11.79368,792.296,17.44087,2.15238,41.37699,Barcelona,Weekday
330.9517,Entire home/apt,False,False,3,False,0,1,9,100,⋯,2.565611,0.70129045,391.0874,13.32889,525.5139,11.56818,2.18197,41.40842,Barcelona,Weekday


In [3]:
# Select Relevant Variables
# Drop the eight columns that are excluded from modelling; all other columns
# of `airbnb` are carried over unchanged. The dplyr:: prefix is required
# because MASS masks `select`.
airbnb_clean <- dplyr::select(
    airbnb,
    -c(room_shared, room_private, multi, biz, attr_index, rest_index, lng, lat)
)
airbnb_clean %>% head()

realSum,room_type,person_capacity,host_is_superhost,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index_norm,rest_index_norm,city,day_type
<dbl>,<chr>,<dbl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
474.3175,Entire home/apt,4,False,10,91,1,1.111996,0.63049095,17.94293,20.15489,Barcelona,Weekday
169.8978,Private room,2,True,10,88,1,1.751839,0.12401717,10.91046,17.48449,Barcelona,Weekday
161.9848,Private room,4,False,9,88,1,1.670493,0.08032198,11.72659,18.50581,Barcelona,Weekday
367.9568,Entire home/apt,3,False,10,91,1,1.475847,0.09310686,13.6346,20.83736,Barcelona,Weekday
196.8953,Private room,3,False,9,91,1,1.855452,0.27248605,11.79368,17.44087,Barcelona,Weekday
330.9517,Entire home/apt,3,False,9,100,2,2.565611,0.70129045,13.32889,11.56818,Barcelona,Weekday


In [4]:
# Full MLR Model
# Regress realSum on every other column of airbnb_clean.
airbnb_mlr <- lm(formula = realSum ~ ., data = airbnb_clean)

# Coefficient table for the full model; stored but not displayed in this cell.
airbnb_mlr_results <- airbnb_mlr %>% broom::tidy()

In [5]:
# Forward & Backward Selection
# Stepwise subset searches over all columns of airbnb_clean, each capped at
# 12 terms (nvmax). Each search is summarised right after it is fitted.

# Forward search and its summary
airbnb_for_selection <- regsubsets(
    realSum ~ .,
    nvmax = 12,
    data = airbnb_clean,
    method = "forward"
)
airbnb_for_selection_summary <- summary(airbnb_for_selection)

# Backward search and its summary
airbnb_back_selection <- regsubsets(
    realSum ~ .,
    nvmax = 12,
    data = airbnb_clean,
    method = "backward"
)
airbnb_back_selection_summary <- summary(airbnb_back_selection)

In [6]:
# Model Evaluation
# Fit statistics for each model size found by forward selection. The size index
# is derived from the summary itself instead of being hard-coded as 1:12, so the
# table still builds if `nvmax` is changed or regsubsets() returns fewer models
# than requested (tibble() errors on mismatched column lengths).
airbnb_for_selection_summary_df <- tibble(
    n_input_variables = seq_along(airbnb_for_selection_summary$rsq),
    RSQ = airbnb_for_selection_summary$rsq,
    RSS = airbnb_for_selection_summary$rss,
    ADJ_R2 = airbnb_for_selection_summary$adjr2,
    Cp = airbnb_for_selection_summary$cp,
    BIC = airbnb_for_selection_summary$bic
)

# Selected Variables
# Index of the model with the highest adjusted R^2, then the names of the
# coefficients in that model; [-1] drops the first entry (the intercept).
adjr2_max <- which.max(airbnb_for_selection_summary$adjr2)
selected_var_adjr2 <- names(coef(airbnb_for_selection, adjr2_max))[-1]

selected_var_adjr2

In [7]:
# Train/Test Split
# Hold out 25% of airbnb_clean for testing; the remaining 75% is used for fitting.
# NOTE(review): the forward/backward selection earlier in the notebook was run on
# all of airbnb_clean, i.e. before this split, so the held-out rows also
# influenced which predictors were chosen. Consider splitting before selection.
split_data <- airbnb_clean %>% initial_split(prop = 0.75)

train <- split_data %>% training()
test <- split_data %>% testing()

In [8]:
# Selected Predictive Model
# Fit the chosen specification on the training rows only. Term order is kept
# as-is so the coefficient table below lists terms in the same order.
airbnb_mlr_predictor <- lm(
    formula = realSum ~
        room_type + person_capacity + cleanliness_rating + bedrooms +
        dist + metro_dist + attr_index_norm + rest_index_norm +
        city + day_type,
    data = train
)

airbnb_mlr_predictor %>% broom::tidy()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-148.011219,36.1721302,-4.091858,4.313662e-05
room_typePrivate room,-83.746729,8.6814235,-9.646659,6.357707e-22
room_typeShared room,-235.676451,34.0135311,-6.928903,4.50863e-12
person_capacity,46.987402,3.4712094,13.53632,2.248736e-41
cleanliness_rating,12.332175,3.3402191,3.692026,0.0002236854
bedrooms,107.178143,6.4194866,16.69575,1.002443e-61
dist,8.518565,2.9149766,2.922344,0.003482029
metro_dist,-13.327915,11.6710172,-1.141967,0.2534958
attr_index_norm,4.520671,0.6198898,7.292701,3.273757e-13
rest_index_norm,2.524601,0.4349978,5.80371,6.688112e-09
