### Imports and Data Loading

In [11]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Read the calendar, listings and reviews tables with pandas' default parsing.
c = pd.read_csv('./Airbnb Seattle Listings/calendar.csv')
l = pd.read_csv('./Airbnb Seattle Listings/listings.csv')
r = pd.read_csv('./Airbnb Seattle Listings/reviews.csv')

# Keep zipcode as a string column instead of letting pandas parse it as a number.
price = pd.read_csv(
    './Airbnb Seattle Listings/house_price.csv',
    dtype={'zipcode': object},
)

# rent.csv: the first group of columns is kept as raw strings (object dtype),
# while Mean/Median/Stdev/Samples are parsed as integers.
rent_dtypes = dict.fromkeys(
    ['id', 'State_Code', 'State_Name', 'State_ab', 'County', 'City', 'Place',
     'Type', 'Primary', 'Zip_Code', 'Area_Code', 'ALand', 'AWater', 'Lat', 'Lon'],
    object,
)
rent_dtypes.update(dict.fromkeys(['Mean', 'Median', 'Stdev', 'Samples'], int))
rent = pd.read_csv(
    './Airbnb Seattle Listings/rent.csv',
    encoding='latin-1',
    dtype=rent_dtypes,
)

### Homogenize attributes

In [2]:
# Price string to float: strip '$' and ',' before casting.
# The pattern is a raw string so that "\$" reaches the regex engine verbatim;
# in a normal string literal "\$" is an invalid escape sequence and triggers a
# DeprecationWarning/SyntaxWarning on recent Python versions.
l['price'] = l['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Remove entries with the corrupt zipcode value (two codes joined by a newline).
l = l[l.zipcode != "99\n98122"]

# Rename attributes to match the other dataframes:
# listings 'id' -> 'listing_id', rent 'Zip_Code' -> 'zipcode'.
l = l.rename(columns={'id': 'listing_id'})
rent = rent.rename(columns={'Zip_Code': 'zipcode'})

### Preview Data

In [14]:
# Number of rows produced by inner-joining listings with house prices on zipcode.
# NOTE(review): an inner join repeats a listing once per matching price row, so
# this count can exceed len(l) if price holds several rows per zipcode — confirm.
l.merge(price, on='zipcode', how='inner').shape[0]

1110094

In [None]:
# Show the first five rows of the house price table.
price.head(n=5)

### Create dataframe with useful data

In [None]:
# Measure how sparse square_feet is: total entries vs. missing entries.
sq_ft = l['square_feet']
total_sq_ft = sq_ft.size
nan_sq_ft = sq_ft.isna().sum()
print('\nsquare_feet attribute is vastly Nan, not very useful')
print('Total square_feet entries: {}, Total square_feet NaN: {}'.format(total_sq_ft, nan_sq_ft))

# Columns carried forward into the analysis frame.
useful_columns = [
    'price',
    'review_scores_location',
    'square_feet',
    'property_type',
    'bedrooms',
    'bathrooms',
    'accommodates',
    'zipcode',
    'host_neighbourhood',
]
data = l[useful_columns]
data.head()

### Group by neighbourhood

In [None]:
# Group the listings by the host's neighbourhood.
by_neighbourhood = data.groupby(['host_neighbourhood'])

# Per-neighbourhood means. groupby sorts the group keys by default, so both
# series are indexed by neighbourhood name in sorted order.
h_prices = by_neighbourhood.price.mean()
h_loc_reviews = by_neighbourhood.review_scores_location.mean()

# Neighbourhood labels in the SAME order as the aggregated series.
# (dropna().unique() would return them in order of first appearance, which
# mislabels bars when the labels are paired positionally with h_prices.)
hoods = h_prices.index.to_numpy()

h_prices.head()


### Grouping by Zipcode

In [None]:
# Group the listings by zipcode.
by_zip = data.groupby(['zipcode'])

# Per-zipcode means. groupby sorts the group keys by default, so both series
# are indexed by zipcode in sorted order.
z_prices = by_zip.price.mean()
z_loc_reviews = by_zip.review_scores_location.mean()

# Zipcode labels in the SAME order as the aggregated series.
# (dropna().unique() would return them in order of first appearance, which
# mislabels bars when the labels are paired positionally with z_prices.)
zips = z_prices.index.to_numpy()

z_prices.head()

### Plot Both Groupings

In [None]:
def _bar_by_group(ax, series, title, **bar_kwargs):
    """Draw `series` as a bar chart on `ax`, one bar per index entry.

    The x labels are taken from the series' own index so that every bar is
    labelled with the group it was computed for; Axes.bar pairs x and height
    purely by position.
    """
    ax.bar(series.index, series.to_numpy(), **bar_kwargs)
    ax.set_title(title)


fig, xarr = plt.subplots(4, 1, figsize=(35, 45))
fig.subplots_adjust(hspace=1)

_bar_by_group(xarr[0], z_prices, 'Mean Price per Zipcode')
_bar_by_group(xarr[1], z_loc_reviews, 'Mean Location Review per Zipcode')
_bar_by_group(xarr[2], h_prices, 'Mean Price per Neighbourhood', width=0.5)
_bar_by_group(xarr[3], h_loc_reviews, 'Mean Location Reviews per Neighbourhood', width=0.5)

for i, ax in enumerate(xarr):
    ax.title.set_size(45)
    # The two zipcode panels get larger tick labels than the two neighbourhood panels.
    fontsize = 25 if i < 2 else 15
    ax.tick_params(axis='x', labelrotation=90, labelsize=fontsize)


### Attempt to find relationship between price and location review

In [None]:
# Mean price divided by mean location review score, per group.
# This is a ratio, matching the variable names, the plot titles
# ("Price over Location Review") and the y-axis label ("Price/Location Review").
z_price_loc_ratio = z_prices / z_loc_reviews
h_price_loc_ratio = h_prices / h_loc_reviews

fig, xarr = plt.subplots(2, 1, figsize=(35, 20))

# Plot each series against its own index so bars and groups stay paired.
xarr[0].bar(z_price_loc_ratio.index, z_price_loc_ratio)
xarr[0].set_title('Price over Location Review per Zipcode')
xarr[0].set_ylabel('Price/Location Review')

xarr[1].bar(h_price_loc_ratio.index, h_price_loc_ratio)
xarr[1].set_title('Price over Location Review per Neighbourhood')
xarr[1].set_ylabel('Price/Location Review')

for ax in xarr:
    ax.title.set_size(25)
    ax.yaxis.label.set_size(25)
    # Hide the x ticks in this figure.
    ax.set_xticks([])

In [None]:
# Side-by-side comparison: mean price (left, red) vs. the price/location-review
# metric (right, blue); zipcodes on the top row, neighbourhoods on the bottom.
fig, xarr = plt.subplots(2, 2, figsize=(35, 50))

# Each series is plotted against its own index so bars and groups stay paired
# (Axes.bar matches x and height by position only).
panels = [
    (xarr[0, 0], z_prices, 'r', 'Mean Price per Zipcode'),
    (xarr[0, 1], z_price_loc_ratio, 'b', 'Price over Location Review per Zipcode'),
    (xarr[1, 0], h_prices, 'r', 'Mean Price per Neighbourhood'),
    (xarr[1, 1], h_price_loc_ratio, 'b', 'Price over Location Review per Neighbourhood'),
]
for ax, series, colour, title in panels:
    ax.bar(series.index, series, alpha=0.5, color=colour)
    ax.set_title(title)



In [None]:
# Number of rows currently in the listings frame.
l.shape[0]