# Capstone Modeling Script


In [14]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from scipy import stats

# Import regression modules
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels import regression
from patsy import dmatrices
from sklearn.model_selection import train_test_split

#allow all columns to be viewed:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
current_path = % pwd
#If current_path is in Scripts folder, 
#go up one level so we can open the data folder
if (current_path.rsplit('\\', 1)[1] == 'Scripts'):
    % cd ..

D:\Coding Projects\Springboard\Springboard_Projects\Capstone


In [3]:
#Import data

#Read in AirBnB dataset that has been merged with summarized Yelp information:
#... (number of businesses, total reviews, and average star rating for businesses w/in .1 and .5 miles)
path1='../Capstone/Data/abb_stat_inf_changes.csv'
abb = pd.read_csv(path1)

print(abb.shape)
abb.head()

(16011, 73)


Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,neighborhood,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,instant_bookable,cancellation_policy,square_feet_notNA,monthly_price_notNA,weekly_price_notNA,bathrooms_notNA,beds_notNA,bedrooms_notNA,security_deposit_notNA,cleaning_fee_notNA,host_response_rate_notNA,reviews_per_month_notNA,neighbourhood_notNA,host_neighbourhood_notNA,neighbourhood_cleansed_notNA,host_response_time_notNA,host_is_superhost_notNA,has_Wifi,has_Heating,has_Essentials,has_Kitchen,has_Smoke_detector,has_Air_conditioning,has_Hangers,has_Washer,has_Dryer,has_Shampoo,has_TV,has_Familykid_friendly,has_Elevator,has_Free_parking_on_premises,has_Internet,has_Gym,has_Cable_TV,has_Paid_parking_off_premises,has_Pool,has_Hot_tub,has_Pets_allowed,has_Breakfast,has_Buzzerwireless_intercom,has_Indoor_fireplace,has_Free_street_parking,has_Wheelchair_accessible,has_Doorman,has_Pets_live_on_this_property,has_Smoking_allowed,host_lives_near_listing,yelp_bus_count_1,yelp_bus_total_reviews_1,yelp_bus_avg_rating_1,yelp_bus_count_5,yelp_bus_total_reviews_5,yelp_bus_avg_rating_5,log_price,Utilization_Rate
0,1419,within an hour,0.0,f,Little Portugal,House,Entire home/apt,6.0,2.0,3.0,4.0,Real Bed,470.0,1000.0,150.0,1.0,0.0,4.0,f,strict_14_with_grace_period,True,False,True,True,True,True,True,True,False,True,True,True,True,False,True,True,False,False,True,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1.0,66.0,3.5,116.0,8634.0,3.758621,6.152733,0.013053
1,10314,within an hour,0.0,f,Riverdale,House,Private room,2.0,1.0,1.0,1.0,Real Bed,69.0,0.0,0.0,2.0,20.0,1.0,f,moderate,False,True,True,True,True,True,False,False,False,True,True,True,True,False,True,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,,,,,,,4.234107,0.042601
2,12604,within an hour,0.0,f,The Annex,House,Private room,1.0,1.5,1.0,1.0,Pull-out Sofa,65.0,130.0,26.0,1.0,20.0,1.0,f,moderate,True,True,True,True,True,True,True,True,False,False,True,True,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,,,,11.0,1631.0,3.045455,4.174387,0.000866
3,17936,within an hour,100.0,t,Kensington Market,Apartment,Private room,4.0,1.0,1.0,2.0,Real Bed,99.0,300.0,80.0,1.0,20.0,2.0,f,strict_14_with_grace_period,False,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,False,True,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,20.0,1440.0,3.65,160.0,11434.0,3.715625,4.59512,0.105691
4,23691,within an hour,100.0,t,Wychwood,House,Private room,2.0,1.0,1.0,1.0,Real Bed,70.0,0.0,0.0,2.0,25.0,1.0,t,strict_14_with_grace_period,False,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,False,True,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,,,,11.0,147.0,3.363636,4.248495,0.092562


## Drop Unecessary Variables

### Drop Uneeded Amenities
Earlier in the Statistical Inference script I ran a series of t-tests to determine whether each Amenity has a statistically significant relationship to Price and/or Utilization Rate.

I plan to drop Amenities that did not have a statistically significant relationship. My desired p-value to determine statistical significance is 0.05. However I know that when making multiple comparisons, it is often very easy to have tests that pass the threshold purely by chance. The Bonferroni correction is a simple way to avoid this issue. 

The number of comparisons being made is 29, so I know my actual p-value threshold needs to be (0.05/29=) 0.0017. I will drop any amenities that did not pass that threshold. 

Note: There were differences in which Amenities were statistically significant against Price vs. Utilization Rate. At this point I will create two datasets, one to predict Price, the other Utilization Rate, so I can keep the variables with the best relationship to each outcome. 

In [4]:
#Create copies of dataset
abb_Price_predict = abb.copy()
abb_Util_predict = abb.copy()

In [5]:
#Drop amenity variables with a p-value greater than 0.0017 from Price prediction dataset
drop_cols = ['has_Wifi', 'has_Doorman', 'has_Indoor_fireplace', 'has_Familykid_friendly', 'has_Wheelchair_accessible',
            'has_Hot_tub', 'has_Hangers', 'has_Buzzerwireless_intercom', 'has_Shampoo']

abb_Price_predict.drop(columns=drop_cols, axis=1, inplace=True)

abb_Price_predict.shape

(16011, 64)

In [6]:
#Drop amenity variables with a p-value greater than 0.0017 from Utilization_Rate prediction dataset
drop_cols = ['has_Hangers', 'has_Smoking_allowed', 'has_Pool', 'has_Hot_tub', 'has_Air_conditioning', 'has_Elevator',
            'has_Gym', 'has_Familykid_friendly', 'has_Free_parking_on_premises', 'has_Smoke_detector', 'has_Wheelchair_accessible',
            'has_Pets_allowed', 'has_Indoor_fireplace', 'has_Paid_parking_off_premises', 'has_Breakfast', 'has_Doorman',
            'has_Essentials', 'has_Free_street_parking', 'has_Pets_live_on_this_property', 'has_Shampoo', 'has_Buzzerwireless_intercom']

abb_Util_predict.drop(columns=drop_cols, axis=1, inplace=True)

abb_Util_predict.shape

(16011, 52)

### Drop Uneeded "NA" Flag Variables

Now I will do the same for the \_notNA variables, which are flag variables I created during data wrangling to tell me if a value was missing for a given variable. I did a series of t-tests on these against Price and Utilization to see which had a statistically significant relationship. 

This time the number of comparisons was 10, so the p-value threshold they need to pass for significance is (0.05/10=) 0.005.

Note: Technically there were 15 \_notNA variables tested in the Statistical Inference script, but this was a mistake. 5 of those had no NA values (likely the records with missing values were dropped over time for those) so they had no actual data to test. Those variables will be dropped below as well. 

In [7]:
#Drop _notNA variables with a p-value greater than 0.0017 from Price prediction dataset
drop_cols = ['bathrooms_notNA', 'beds_notNA', 'bedrooms_notNA', 'neighbourhood_cleansed_notNA', 'host_is_superhost_notNA',
            'monthly_price_notNA', 'weekly_price_notNA', 'neighbourhood_notNA', 'host_response_rate_notNA', 'host_response_time_notNA',
            'host_neighbourhood_notNA', 'square_feet_notNA']

abb_Price_predict.drop(columns=drop_cols, axis=1, inplace=True)

abb_Price_predict.shape

(16011, 52)

In [8]:
#Drop _notNA variables with a p-value greater than 0.0017 from Utilization Rate prediction dataset
drop_cols = ['bathrooms_notNA', 'beds_notNA', 'bedrooms_notNA', 'neighbourhood_cleansed_notNA', 'host_is_superhost_notNA',
            'square_feet_notNA', 'weekly_price_notNA', 'host_neighbourhood_notNA', 'monthly_price_notNA']

abb_Util_predict.drop(columns=drop_cols, axis=1, inplace=True)

abb_Util_predict.shape

(16011, 43)

### Drop Yelp Business Metrics w/out Statistical Significance, Create Flag Variables for Having a Business w/in .1 and .5 Miles

Next I will drop the Yelp Business metrics that did not show a statistically significant correlation with Price/Utilization (or if the correlation was extremely low). 

As a reminder, the Yelp Business metrics are Number of Businesses, Total Reviews, and Avg. Star Rating. These were calculated for all businesses within .1 and .5 miles of each AirBnB listing, for 6 variables total. 

All three of the metrics for busineses within .1 mile had a very low correlation with both Price and Utilization. I will drop all three of the .1 mile metrics. However when doing a t-test comparing the Price of listings with at least one business within .1 mile and those that did not, there was a large and statistically significant price difference (same for those with/without businesses within .5 mile). So before dropping the .1 mile metrics I will create a flag variable that is True if the AirBnB listing has a business within .1 (and .5) miles.

In [9]:
#Create flag variable indicating if any business was within .1, .5 miles of an AirBnB listing
abb_Price_predict['Bus_in_pt_1'] = pd.notna(abb_Price_predict['yelp_bus_count_1'])
abb_Price_predict['Bus_in_pt_5'] = pd.notna(abb_Price_predict['yelp_bus_count_5'])

abb_Util_predict['Bus_in_pt_1'] = pd.notna(abb_Util_predict['yelp_bus_count_1'])
abb_Util_predict['Bus_in_pt_5'] = pd.notna(abb_Util_predict['yelp_bus_count_5'])

In [10]:
#Drop .1 mile Yelp Business variables
drop_cols = ['yelp_bus_count_1', 'yelp_bus_total_reviews_1', 'yelp_bus_avg_rating_1']

abb_Price_predict.drop(columns=drop_cols, axis=1, inplace=True)
abb_Util_predict.drop(columns=drop_cols, axis=1, inplace=True)

print(abb_Price_predict.shape)
print(abb_Util_predict.shape)

(16011, 51)
(16011, 42)


The .5 mile Yelp Business metrics, specifically Number of Businesses and Total Reviews, did show both a practical correlation coefficient and statistical significance when compared to Price, so those variables will be kept in the Price prediction dataset. The Avg. Rating was not practically significant against Price, so it will be dropped.

All three .5 mile Yelp Business metrics were not practically significant against Utilization Rate and will be dropped. 

In [11]:
#Drop .5 mile Yelp Business variables from price prediction dataset
drop_cols = ['yelp_bus_avg_rating_5']

abb_Price_predict.drop(columns=drop_cols, axis=1, inplace=True)

print(abb_Price_predict.shape)

(16011, 50)


In [12]:
#Drop .5 mile Yelp Business variables from utilization prediction dataset
drop_cols = ['yelp_bus_count_5', 'yelp_bus_total_reviews_5', 'yelp_bus_avg_rating_5']

abb_Util_predict.drop(columns=drop_cols, axis=1, inplace=True)

print(abb_Util_predict.shape)

(16011, 39)


As a final step I will replace NaN values in the Yelp metrics with 0 to ensure the modeling algorithms do not drop those records.

In [13]:
abb_Price_predict[['yelp_bus_count_5', 'yelp_bus_total_reviews_5']] = abb_Price_predict[['yelp_bus_count_5', 'yelp_bus_total_reviews_5']].fillna(0.0)

## Predicting Price

### Produce Baseline Price Prediction Model
First I will create a simple baseline linear regression model predicting raw price (not log price) with all of the predictor variables. This will serve as my baseline model to compare future models against. I will use the ols function from the statsmodel package because it gives a nice, very readable output of the important metrics I need for comparison. 

In [14]:
abb_Price_predict.head()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,neighborhood,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,instant_bookable,cancellation_policy,security_deposit_notNA,cleaning_fee_notNA,reviews_per_month_notNA,has_Heating,has_Essentials,has_Kitchen,has_Smoke_detector,has_Air_conditioning,has_Washer,has_Dryer,has_TV,has_Elevator,has_Free_parking_on_premises,has_Internet,has_Gym,has_Cable_TV,has_Paid_parking_off_premises,has_Pool,has_Pets_allowed,has_Breakfast,has_Free_street_parking,has_Pets_live_on_this_property,has_Smoking_allowed,host_lives_near_listing,yelp_bus_count_5,yelp_bus_total_reviews_5,log_price,Utilization_Rate,Bus_in_pt_1,Bus_in_pt_5
0,1419,within an hour,0.0,f,Little Portugal,House,Entire home/apt,6.0,2.0,3.0,4.0,Real Bed,470.0,1000.0,150.0,1.0,0.0,4.0,f,strict_14_with_grace_period,True,True,True,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,116.0,8634.0,6.152733,0.013053,True,True
1,10314,within an hour,0.0,f,Riverdale,House,Private room,2.0,1.0,1.0,1.0,Real Bed,69.0,0.0,0.0,2.0,20.0,1.0,f,moderate,False,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0.0,0.0,4.234107,0.042601,False,False
2,12604,within an hour,0.0,f,The Annex,House,Private room,1.0,1.5,1.0,1.0,Pull-out Sofa,65.0,130.0,26.0,1.0,20.0,1.0,f,moderate,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,True,11.0,1631.0,4.174387,0.000866,False,True
3,17936,within an hour,100.0,t,Kensington Market,Apartment,Private room,4.0,1.0,1.0,2.0,Real Bed,99.0,300.0,80.0,1.0,20.0,2.0,f,strict_14_with_grace_period,True,True,True,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,True,160.0,11434.0,4.59512,0.105691,True,True
4,23691,within an hour,100.0,t,Wychwood,House,Private room,2.0,1.0,1.0,1.0,Real Bed,70.0,0.0,0.0,2.0,25.0,1.0,t,strict_14_with_grace_period,True,True,True,False,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,11.0,147.0,4.248495,0.092562,False,True


In [15]:
#Create formula components
X_cols = "+".join(abb_Price_predict.columns.difference(['id', 'price', 'Utilization_Rate', 'log_price']))
                  
formula = 'price~' + X_cols

In [None]:
#Create Train-Test split


In [16]:
#Fit baseline model
base_m = ols(formula, abb_Price_predict).fit()

print(base_m.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.497
Model:                            OLS   Adj. R-squared:                  0.493
Method:                 Least Squares   F-statistic:                     152.4
Date:                Thu, 08 Nov 2018   Prob (F-statistic):               0.00
Time:                        21:17:06   Log-Likelihood:                -90062.
No. Observations:               16011   AIC:                         1.803e+05
Df Residuals:                   15907   BIC:                         1.811e+05
Df Model:                         103                                         
Covariance Type:            nonrobust                                         
                                                         coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------

My model has many input variables so I will use Adjusted R-Squared to evaluate the model. The baseline is decent at .493, indicating that around half of the variation in the outcome variable is explained by this model. However there are more variables than I would like (103), making the model difficult to interpret and explain to others, and the output report indicates there may be strong multicollinearity. Feature selection will need to be performed to fine-tune this model. 

### Perform Feature Selection Procedures

#### Fit Ridge Regression Model

In [None]:
base_m_Ridge = ols(formula, abb_Price_predict).fit_regularized(alpha=0)

In [33]:
print("Variable Count: " + str(len(base_m_Ridge.params[base_m_Ridge.params > 0].sort_values(ascending=False))))
print(base_m_Ridge.params[base_m_Ridge.params > 0].sort_values(ascending=False))

Variable Count: 73
cancellation_policy[T.super_strict_60]      116.315049
property_type[T.Bed and breakfast]           54.633668
neighborhood[T.Entertainment District]       49.811163
neighborhood[T.Yorkville]                    49.245816
neighborhood[T.Rosedale]                     43.872724
neighborhood[T.Saint Lawrence]               43.671907
neighborhood[T.Cabbagetown]                  43.421963
neighborhood[T.Trinity-Bellwoods]            38.413566
neighborhood[T.Downtown Core]                37.687185
neighborhood[T.Kensington Market]            36.893053
bathrooms                                    36.179355
neighborhood[T.Niagara]                      35.391616
neighborhood[T.South Hill]                   35.271830
cancellation_policy[T.super_strict_30]       32.931891
neighborhood[T.Yonge Eglinton]               32.616101
property_type[T.Loft]                        29.578103
bedrooms                                     22.474160
neighborhood[T.Little Portugal]              2

In [35]:
ridge_vars = base_m_Ridge.params[base_m_Ridge.params > 0]

In [36]:
ridge_vars

Intercept                                    16.895835
Bus_in_pt_1[T.True]                           2.643508
Bus_in_pt_5[T.True]                           3.157171
bed_type[T.Couch]                            16.845357
cancellation_policy[T.super_strict_30]       32.931891
cancellation_policy[T.super_strict_60]      116.315049
has_Air_conditioning[T.True]                  4.603224
has_Breakfast[T.True]                        10.556566
has_Cable_TV[T.True]                          2.054093
has_Elevator[T.True]                          1.110841
has_Free_parking_on_premises[T.True]          1.356854
has_Free_street_parking[T.True]               3.936342
has_Gym[T.True]                               1.743249
has_Pets_live_on_this_property[T.True]        2.201870
has_Pool[T.True]                              7.816734
has_Smoke_detector[T.True]                    2.307830
has_Smoking_allowed[T.True]                   4.481305
has_TV[T.True]                               10.737865
has_Washer

#### Fit LASSO Model

In [None]:
base_m_Lasso = ols(formula, abb_Price_predict).fit_regularized(alpha=1)

In [34]:
print("Variable Count: " + str(len(base_m_Lasso.params[base_m_Lasso.params > 0].sort_values(ascending=False))))
print(base_m_Lasso.params[base_m_Lasso.params > 0].sort_values(ascending=False))

Variable Count: 14
bathrooms                                 36.991190
bedrooms                                  19.601264
accommodates                              14.426329
has_TV[T.True]                            12.187740
neighborhood[T.Entertainment District]    12.181129
property_type[T.Condominium]               6.493346
Bus_in_pt_1[T.True]                        5.035644
neighborhood[T.Downtown Core]              4.861018
has_Pool[T.True]                           2.117577
cleaning_fee                               0.373759
security_deposit                           0.031282
yelp_bus_count_5                           0.009186
host_response_rate                         0.005079
yelp_bus_total_reviews_5                   0.001727
dtype: float64


#### Fit Random Forest

#### Check Variance Inflation Factors for Multi-Collinearity

In [17]:
#X = abb_Price_predict.drop(['id', 'price', 'Utilization_Rate', 'log_price'], axis=1)
#features = "+".join(abb_Price_predict.columns - [['id', 'price', 'Utilization_Rate', 'log_price']])
y, X = dmatrices('price~' + X_cols, abb_Price_predict, return_type='dataframe')

In [18]:
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

In [19]:
vif.round(1)

Unnamed: 0,VIF Factor,features
0,582.2,Intercept
1,1.8,Bus_in_pt_1[T.True]
2,1.3,Bus_in_pt_5[T.True]
3,1.9,bed_type[T.Couch]
4,4.5,bed_type[T.Futon]
5,3.4,bed_type[T.Pull-out Sofa]
6,7.7,bed_type[T.Real Bed]
7,1.7,cancellation_policy[T.moderate]
8,1.9,cancellation_policy[T.strict_14_with_grace_per...
9,1.0,cancellation_policy[T.super_strict_30]


## Predicting Utilization Rate

### Produce Baseline Utilization Rate Prediction Model

#### First Calculate Price vs. Predicted Price Delta