In [14]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
sns.set(style="white")

In [15]:
df = pd.read_pickle("../../Datasets/Data_viz_project/airbnb_listings_price_normalized")

In [24]:
df = df.reset_index()

In [29]:
dummy = pd.get_dummies(df["Country"], drop_first=True)

In [30]:
dummy = dummy.reset_index()

In [42]:
set(df["Country"])

{'Australia',
 'Austria',
 'Belgium',
 'Canada',
 'Denmark',
 'France',
 'Germany',
 'Greece',
 'Hong Kong',
 'Ireland',
 'Italy',
 'Netherlands',
 'Spain',
 'Switzerland',
 'United Kingdom',
 'United States'}

In [46]:
df.columns

Index(['Listing ID', 'Name', 'Host ID', 'Host Name', 'Host Response Rate',
       'Host Is Superhost', 'Host total listings count', 'Country', 'latitude',
       'longitude', 'Property type', 'Room type', 'Accommodates', 'Bathrooms',
       'Bedrooms', 'Amenities', 'Price', 'Minimum nights', 'Maximum nights',
       'Availability 365', 'Calendar last scraped', 'Number of reviews',
       'Last Review Date', 'Review Scores Rating', 'Review Scores Accuracy',
       'Review Scores Cleanliness', 'Review Scores Checkin',
       'Review Scores Communication', 'Review Scores Location',
       'Review Scores Value', 'Reviews per month', 'Amenity_Count',
       'Country_Average', 'Normalized', 'Austria', 'Belgium', 'Canada',
       'Denmark', 'France', 'Germany', 'Greece', 'Hong Kong', 'Ireland',
       'Italy', 'Netherlands', 'Spain', 'Switzerland', 'United Kingdom',
       'United States', 'Const'],
      dtype='object')

In [34]:
df = df.merge(dummy, on="index").drop(columns=("index"))

In [35]:
df["Const"] = 1

In [51]:
X = df[["Amenity_Count", "Accommodates", "Const", "Austria", 
        "Belgium", "Canada", "Denmark", 
        "France", "Germany", "Greece", 
        "Hong Kong", "Ireland", "Italy",
       "Netherlands", "Spain", "Switzerland",
       "United Kingdom", "United States"]]
y = df["Normalized"]

model = sm.OLS(y, X).fit()


model.summary()

0,1,2,3
Dep. Variable:,Normalized,R-squared:,0.363
Model:,OLS,Adj. R-squared:,0.363
Method:,Least Squares,F-statistic:,12730.0
Date:,"Wed, 29 Apr 2020",Prob (F-statistic):,0.0
Time:,13:19:56,Log-Likelihood:,-360600.0
No. Observations:,380224,AIC:,721200.0
Df Residuals:,380206,BIC:,721400.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Amenity_Count,0.0073,0.000,41.702,0.000,0.007,0.008
Accommodates,0.2291,0.001,443.640,0.000,0.228,0.230
Const,0.1313,0.005,27.796,0.000,0.122,0.141
Austria,-0.0211,0.008,-2.573,0.010,-0.037,-0.005
Belgium,0.0596,0.009,6.587,0.000,0.042,0.077
Canada,-0.0006,0.005,-0.112,0.911,-0.011,0.010
Denmark,0.1074,0.007,16.254,0.000,0.094,0.120
France,0.0690,0.005,14.317,0.000,0.060,0.078
Germany,0.1572,0.006,24.908,0.000,0.145,0.170

0,1,2,3
Omnibus:,271221.057,Durbin-Watson:,1.705
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10570324.751
Skew:,2.987,Prob(JB):,0.0
Kurtosis:,28.13,Cond. No.,301.0


### Although the R value is better in this model, it is still not of statistical significance. We will next try adding the GDP of each country to see if that positively affects the model

GDP per capita data pulled from Wikipedia. We used the 2018 World Bank data because it is the closest in time to our estimated time frame from the data: https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28PPP%29_per_capita

In [52]:
GDP = {'Australia':51663,
 'Austria':55455,
 'Belgium':51408,
 'Canada':48130,
 'Denmark':55671,
 'France':45342,
 'Germany':53075,
 'Greece':29592,
 'Hong Kong':64597,
 'Ireland':83203,
 'Italy':41830,
 'Netherlands':56329,
 'Spain':39715,
 'Switzerland':68061,
 'United Kingdom':45973,
 'United States':62795}

In [53]:
GDP_list = [GDP[x] for x in df["Country"].to_list()]

In [56]:
df = df.assign(GDP=GDP_list)

In [59]:
X = df[["Amenity_Count", "Accommodates", "Const","GDP"]]
y = df["Normalized"]

model = sm.OLS(y, X).fit()


model.summary()

0,1,2,3
Dep. Variable:,Normalized,R-squared:,0.354
Model:,OLS,Adj. R-squared:,0.354
Method:,Least Squares,F-statistic:,69340.0
Date:,"Wed, 29 Apr 2020",Prob (F-statistic):,0.0
Time:,13:30:48,Log-Likelihood:,-363290.0
No. Observations:,380224,AIC:,726600.0
Df Residuals:,380220,BIC:,726600.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Amenity_Count,0.0054,0.000,31.624,0.000,0.005,0.006
Accommodates,0.2249,0.001,436.089,0.000,0.224,0.226
Const,0.1264,0.006,20.310,0.000,0.114,0.139
GDP,7.726e-07,1.12e-07,6.914,0.000,5.54e-07,9.92e-07

0,1,2,3
Omnibus:,268382.526,Durbin-Watson:,1.682
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10167364.151
Skew:,2.948,Prob(JB):,0.0
Kurtosis:,27.637,Cond. No.,324000.0


### Here we see that GDP is not as strongly corrolated at dummy variables for each country. Next we will see if inbound tourism has an impact. Data was pulled from the UN Tourism department: https://www.e-unwto.org/doi/pdf/10.18111/9789284421152
We are looking at the international tourist arrivals by 1,000 in 2017 (data starts on page 17)

In [60]:
Tourism = {'Australia':8815,
 'Austria':29460,
 'Belgium':8358,
 'Canada':20883,
 'Denmark':11743,
 'France':86918,
 'Germany':37452,
 'Greece':27194,
 'Hong Kong':27885,
 'Ireland':10338,
 'Italy':58253,
 'Netherlands':17924,
 'Spain':81869,
 'Switzerland':11133,
 'United Kingdom':37651,
 'United States':76941}

In [61]:
Tourism_list = [Tourism[x] for x in df["Country"].to_list()]

In [62]:
df = df.assign(Tourism=Tourism_list)

In [78]:
X = df[["Amenity_Count", "Accommodates", "Const", "Tourism", "Austria", 
        "Belgium", "Canada", "Denmark", 
        "France", "Germany", "Greece", 
        "Hong Kong", "Ireland", "Italy",
       "Netherlands", "Spain", "Switzerland",
       "United Kingdom", "United States"]]
y = df["Normalized"]

model = sm.OLS(y, X).fit()


model.summary()

0,1,2,3
Dep. Variable:,Normalized,R-squared:,0.363
Model:,OLS,Adj. R-squared:,0.363
Method:,Least Squares,F-statistic:,12700.0
Date:,"Wed, 29 Apr 2020",Prob (F-statistic):,0.0
Time:,14:05:04,Log-Likelihood:,-359660.0
No. Observations:,379055,AIC:,719400.0
Df Residuals:,379037,BIC:,719600.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Amenity_Count,0.0073,0.000,41.629,0.000,0.007,0.008
Accommodates,0.2292,0.001,443.223,0.000,0.228,0.230
Const,0.1329,0.005,25.421,0.000,0.123,0.143
Tourism,-2.417e-07,7.34e-08,-3.291,0.001,-3.86e-07,-9.77e-08
Austria,-0.0163,0.008,-2.158,0.031,-0.031,-0.001
Belgium,0.0598,0.009,6.597,0.000,0.042,0.078
Canada,0.0024,0.005,0.493,0.622,-0.007,0.012
Denmark,0.1081,0.007,16.617,0.000,0.095,0.121
France,0.0879,0.003,26.426,0.000,0.081,0.094

0,1,2,3
Omnibus:,270291.652,Durbin-Watson:,1.705
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10517865.405
Skew:,2.985,Prob(JB):,0.0
Kurtosis:,28.106,Cond. No.,5.74e+19


## This model is still not statistically relevant enough to base predictions off of. We decide to abandon this idea and move on to the US only data, to see if location is a larger predictor there. 