# Preparing the Dataset

In [27]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import geopandas as gpd
import geoplot as gplt
from shapely.geometry import Point
import geoplot.crs as gcrs
import contextily as cx
from geopandas.tools import overlay
import mapclassify as mc
import warnings
warnings.filterwarnings('ignore')

In [28]:
# Kept the original dataset
listingsNYC = pd.read_csv('listings.csv', dtype=object)
# Dropping all of the columns which we will not use
# listingsNYC= listingsNYC.drop(['last_review','license','latitude','longitude','host_name'], axis=1)

# Dropping all rows with any 'Nan' values in them
# listingsNYC.dropna(axis=0, how='any', inplace=True)
data_types_dict = {
    'price':'int64', 
    'minimum_nights':'int64', 
    'host_id':str,
    'number_of_reviews':'int64',
    'reviews_per_month':float,
    'calculated_host_listings_count':'int64',
    'availability_365':'int64',
    'number_of_reviews_ltm':'int64',
    'latitude':float,
    'longitude':float
}

#changing all the datatype to their respective types that we can work with
listingsNYC = listingsNYC.astype(data_types_dict)

# creating a price per stay (ppn) column with the price and minimim nights columns and rounding to 2 decimal places. 
# This is a float dtype 
listingsNYC['pps'] = (listingsNYC['price'] * listingsNYC['minimum_nights']).round(2)
listingsNYC['neighbourhood'].unique().shape[0]
listingsNYC.describe()
listingsNYC['annual_revenue'] = (365 - listingsNYC['availability_365']) *  listingsNYC['price']
listingsNYC = listingsNYC[(listingsNYC['price'] != 0)]
listingsNYC.loc[:,'log_price']=np.log(listingsNYC['price'])
listingsNYC.loc[:,'log_pps']=np.log(listingsNYC['pps'])
# listingsNYC['price_bin'] = pd.cut(listingsNYC['price'], bins=range(0, 111, 10), right=False, labels=range(0, 110, 10))
df = listingsNYC[(listingsNYC['price']<2000) & (listingsNYC['price']>20) & (listingsNYC['minimum_nights'] < 30)]
df['price_bin'] = pd.cut(df['price'], bins=range(0, 2001, 20), right=False, labels=range(0, 2000, 20))

In [29]:
price_df = df[['neighbourhood_group','room_type','price']]

brooklyn_home = price_df[(price_df['neighbourhood_group']=='Brooklyn') & (price_df['room_type']=='Entire home/apt')]
brooklyn_private = price_df[(price_df['neighbourhood_group']=='Brooklyn') & (price_df['room_type']=='Private room')]
brooklyn_hotel = price_df[(price_df['neighbourhood_group']=='Brooklyn') & (price_df['room_type']=='Hotel room')]
brooklyn_shared = price_df[(price_df['neighbourhood_group']=='Brooklyn') & (price_df['room_type']=='Shared room')]


manhattan_home = price_df[(price_df['neighbourhood_group']=='Manhattan')  & (price_df['room_type']=='Entire home/apt')]
manhattan_private = price_df[(price_df['neighbourhood_group']=='Manhattan') & (price_df['room_type']=='Private room')]
manhattan_hotel = price_df[(price_df['neighbourhood_group']=='Manhattan') & (price_df['room_type']=='Hotel room')]
manhattan_shared = price_df[(price_df['neighbourhood_group']=='Manhattan') & (price_df['room_type']=='Shared room')]

queens_home = price_df[(price_df['neighbourhood_group']=='Queens')  & (price_df['room_type']=='Entire home/apt')]
queens_private = price_df[(price_df['neighbourhood_group']=='Queens') & (price_df['room_type']=='Private room')]
queens_hotel = price_df[(price_df['neighbourhood_group']=='Queens') & (price_df['room_type']=='Hotel room')]
queens_shared = price_df[(price_df['neighbourhood_group']=='Queens') & (price_df['room_type']=='Shared room')]

bronx_home = price_df[(price_df['neighbourhood_group']=='Bronx')  & (price_df['room_type']=='Entire home/apt')]
bronx_private = price_df[(price_df['neighbourhood_group']=='Bronx') & (price_df['room_type']=='Private room')]
bronx_hotel = price_df[(price_df['neighbourhood_group']=='Bronx') & (price_df['room_type']=='Hotel room')]
bronx_shared = price_df[(price_df['neighbourhood_group']=='Bronx') & (price_df['room_type']=='Shared room')]

staten_island_home = price_df[(price_df['neighbourhood_group']=='Staten Island')  & (price_df['room_type']=='Entire home/apt')]
staten_island_private = price_df[(price_df['neighbourhood_group']=='Staten Island') & (price_df['room_type']=='Private room')]
staten_island_hotel = price_df[(price_df['neighbourhood_group']=='Staten Island') & (price_df['room_type']=='Hotel room')]
staten_island_shared = price_df[(price_df['neighbourhood_group']=='Staten Island') & (price_df['room_type']=='Shared room')]


In [30]:
brooklyn_home_mean = float(brooklyn_home.mean())
brooklyn_private_mean = float(brooklyn_private.mean())
brooklyn_hotel_mean = float(brooklyn_hotel.mean())
brooklyn_shared_mean = float(brooklyn_shared.mean())


manhattan_home_mean = float(manhattan_home.mean())
manhattan_private_mean = float(manhattan_private.mean())
manhattan_hotel_mean = float(manhattan_hotel.mean())
manhattan_shared_mean = float(manhattan_shared.mean())

queens_home_mean = float(queens_home.mean())
queens_private_mean = float(queens_private.mean())
queens_hotel_mean = float(queens_hotel.mean())
queens_shared_mean = float(queens_shared.mean())

bronx_home_mean = float(bronx_home.mean())
bronx_private_mean = float(bronx_private.mean())
# bronx_hotel_mean = float(bronx_hotel.mean())
bronx_shared_mean = float(bronx_shared.mean())

staten_island_home_mean = float(staten_island_home.mean())
staten_island_private_mean = float(staten_island_private.mean())
# staten_island_hotel_mean = float(staten_island_hotel.mean())
staten_island_shared_mean = float(staten_island_shared.mean())

In [31]:
brooklyn_home_std = float(brooklyn_home.std())
brooklyn_private_std = float(brooklyn_private.std())
brooklyn_hotel_std = float(brooklyn_hotel.std())
brooklyn_shared_std = float(brooklyn_shared.std())


manhattan_home_std = float(manhattan_home.std())
manhattan_private_std = float(manhattan_private.std())
manhattan_hotel_std = float(manhattan_hotel.std())
manhattan_shared_std = float(manhattan_shared.std())

queens_home_std = float(queens_home.std())
queens_private_std = float(queens_private.std())
queens_hotel_std = float(queens_hotel.std())
queens_shared_std = float(queens_shared.std())

bronx_home_std = float(bronx_home.std())
bronx_private_std = float(bronx_private.std())
# bronx_hotel_std = float(bronx_hotel.std())
bronx_shared_std = float(bronx_shared.std())

staten_island_home_std = float(staten_island_home.std())
staten_island_private_std = float(staten_island_private.std())
# staten_island_hotel_std = float(staten_island_hotel.std())
staten_island_shared_std = float(staten_island_shared.std())

In [32]:
df['neighborhood_group_room_type_mean'] = [0] * len(df)
df['neighborhood_group_room_type_std'] = [0] * len(df)

In [33]:
df.loc[(df['neighbourhood_group']=='Brooklyn') & (df['room_type']=='Entire home/apt'),'neighborhood_group_room_type_mean']=brooklyn_home_mean
df.loc[(df['neighbourhood_group']=='Brooklyn') & (df['room_type']=='Private room'),'neighborhood_group_room_type_mean']=brooklyn_private_mean
df.loc[(df['neighbourhood_group']=='Brooklyn') & (df['room_type']=='Hotel room'),'neighborhood_group_room_type_mean']=brooklyn_hotel_mean
df.loc[(df['neighbourhood_group']=='Brooklyn') & (df['room_type']=='Shared room'),'neighborhood_group_room_type_mean']=brooklyn_shared_mean


df.loc[(df['neighbourhood_group']=='Manhattan')  & (df['room_type']=='Entire home/apt'),'neighborhood_group_room_type_mean']=manhattan_home_mean
df.loc[(df['neighbourhood_group']=='Manhattan') & (df['room_type']=='Private room'),'neighborhood_group_room_type_mean']=manhattan_private_mean
df.loc[(df['neighbourhood_group']=='Manhattan') & (df['room_type']=='Hotel room'),'neighborhood_group_room_type_mean']=manhattan_hotel_mean
df.loc[(df['neighbourhood_group']=='Manhattan') & (df['room_type']=='Shared room'),'neighborhood_group_room_type_mean']=manhattan_shared_mean

df.loc[(df['neighbourhood_group']=='Queens')  & (df['room_type']=='Entire home/apt'),'neighborhood_group_room_type_mean']=queens_home_mean
df.loc[(df['neighbourhood_group']=='Queens') & (df['room_type']=='Private room'),'neighborhood_group_room_type_mean']=queens_private_mean
df.loc[(df['neighbourhood_group']=='Queens') & (df['room_type']=='Hotel room'),'neighborhood_group_room_type_mean']=queens_hotel_mean
df.loc[(df['neighbourhood_group']=='Queens') & (df['room_type']=='Shared room'),'neighborhood_group_room_type_mean']=queens_shared_mean

df.loc[(df['neighbourhood_group']=='Bronx')  & (df['room_type']=='Entire home/apt'),'neighborhood_group_room_type_mean']=bronx_home_mean
df.loc[(df['neighbourhood_group']=='Bronx') & (df['room_type']=='Private room'),'neighborhood_group_room_type_mean']=bronx_private_mean
df.loc[(df['neighbourhood_group']=='Bronx') & (df['room_type']=='Shared room'),'neighborhood_group_room_type_mean']=bronx_shared_mean

df.loc[(df['neighbourhood_group']=='Staten Island')  & (df['room_type']=='Entire home/apt'),'neighborhood_group_room_type_mean']=staten_island_home_mean
df.loc[(df['neighbourhood_group']=='Staten Island') & (df['room_type']=='Private room'),'neighborhood_group_room_type_mean']=staten_island_private_mean
df.loc[(df['neighbourhood_group']=='Staten Island') & (df['room_type']=='Shared room'),'neighborhood_group_room_type_mean']=staten_island_shared_mean
df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,availability_365,number_of_reviews_ltm,license,pps,annual_revenue,log_price,log_pps,price_bin,neighborhood_group_room_type_mean,neighborhood_group_room_type_std
0,5136,"Spacious Brooklyn Duplex, Patio + Garden",7378,Rebecca,Brooklyn,Sunset Park,40.662650,-73.994540,Entire home/apt,275,...,267,1,,5775,26950,5.616771,8.661294,260,235.798218,0
1,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.803800,-73.967510,Private room,75,...,0,0,,150,27375,4.317488,5.010635,60,300.856827,0
3,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Midtown,40.764570,-73.983170,Private room,68,...,79,50,,136,19448,4.219508,4.912655,60,300.856827,0
6,5803,"Lovely Room 1 in BEST AREA; Legal Rental, Spot...",9744,Laurie,Brooklyn,South Slope,40.668010,-73.987840,Private room,124,...,163,14,,496,25048,4.820282,6.206576,120,103.223430,0
8,7097,Perfect for Your Parents: Privacy + Garden,17571,Jane,Brooklyn,Fort Greene,40.691233,-73.972702,Entire home/apt,220,...,38,36,,660,71940,5.393628,6.492240,220,235.798218,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41525,772714221060214808,Good Vibes at The Bronx,421264574,Aridio,Bronx,Unionport,40.828074,-73.847886,Entire home/apt,125,...,60,0,,250,38125,4.828314,5.521461,120,163.684932,0
41526,772716724205003579,2 bedroom Condo near West Village,481177884,Steven,Manhattan,Chelsea,40.740030,-73.997302,Entire home/apt,1114,...,355,0,,3342,11140,7.015712,8.114325,1100,349.695143,0
41527,771961589340472067,Habitación Privada,409771624,Camila,Queens,Sunnyside,40.742223,-73.923310,Private room,124,...,360,0,,248,620,4.820282,5.513429,120,91.758048,0
41528,771962449581256963,Romántico y natural,421601513,Juan Carlos,Manhattan,Washington Heights,40.847271,-73.943419,Private room,80,...,358,0,,400,560,4.382027,5.991465,80,300.856827,0


In [34]:
df.loc[(df['neighbourhood_group']=='Brooklyn') & (df['room_type']=='Entire home/apt'),'neighborhood_group_room_type_std']=brooklyn_home_std
df.loc[(df['neighbourhood_group']=='Brooklyn') & (df['room_type']=='Private room'),'neighborhood_group_room_type_std']=brooklyn_private_std
df.loc[(df['neighbourhood_group']=='Brooklyn') & (df['room_type']=='Hotel room'),'neighborhood_group_room_type_std']=brooklyn_hotel_std
df.loc[(df['neighbourhood_group']=='Brooklyn') & (df['room_type']=='Shared room'),'neighborhood_group_room_type_std']=brooklyn_shared_std


df.loc[(df['neighbourhood_group']=='Manhattan')  & (df['room_type']=='Entire home/apt'),'neighborhood_group_room_type_std']=manhattan_home_std
df.loc[(df['neighbourhood_group']=='Manhattan') & (df['room_type']=='Private room'),'neighborhood_group_room_type_std']=manhattan_private_std
df.loc[(df['neighbourhood_group']=='Manhattan') & (df['room_type']=='Hotel room'),'neighborhood_group_room_type_std']=manhattan_hotel_std
df.loc[(df['neighbourhood_group']=='Manhattan') & (df['room_type']=='Shared room'),'neighborhood_group_room_type_std']=manhattan_shared_std

df.loc[(df['neighbourhood_group']=='Queens')  & (df['room_type']=='Entire home/apt'),'neighborhood_group_room_type_std']=queens_home_std
df.loc[(df['neighbourhood_group']=='Queens') & (df['room_type']=='Private room'),'neighborhood_group_room_type_std']=queens_private_std
df.loc[(df['neighbourhood_group']=='Queens') & (df['room_type']=='Hotel room'),'neighborhood_group_room_type_std']=queens_hotel_std
df.loc[(df['neighbourhood_group']=='Queens') & (df['room_type']=='Shared room'),'neighborhood_group_room_type_std']=queens_shared_std

df.loc[(df['neighbourhood_group']=='Bronx')  & (df['room_type']=='Entire home/apt'),'neighborhood_group_room_type_std']=bronx_home_std
df.loc[(df['neighbourhood_group']=='Bronx') & (df['room_type']=='Private room'),'neighborhood_group_room_type_std']=bronx_private_std
df.loc[(df['neighbourhood_group']=='Bronx') & (df['room_type']=='Shared room'),'neighborhood_group_room_type_std']=bronx_shared_std

df.loc[(df['neighbourhood_group']=='Staten Island')  & (df['room_type']=='Entire home/apt'),'neighborhood_group_room_type_std']=staten_island_home_std
df.loc[(df['neighbourhood_group']=='Staten Island') & (df['room_type']=='Private room'),'neighborhood_group_room_type_std']=staten_island_private_std
df.loc[(df['neighbourhood_group']=='Staten Island') & (df['room_type']=='Shared room'),'neighborhood_group_room_type_std']=staten_island_shared_std
df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,availability_365,number_of_reviews_ltm,license,pps,annual_revenue,log_price,log_pps,price_bin,neighborhood_group_room_type_mean,neighborhood_group_room_type_std
0,5136,"Spacious Brooklyn Duplex, Patio + Garden",7378,Rebecca,Brooklyn,Sunset Park,40.662650,-73.994540,Entire home/apt,275,...,267,1,,5775,26950,5.616771,8.661294,260,235.798218,166.921832
1,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.803800,-73.967510,Private room,75,...,0,0,,150,27375,4.317488,5.010635,60,300.856827,358.688003
3,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Midtown,40.764570,-73.983170,Private room,68,...,79,50,,136,19448,4.219508,4.912655,60,300.856827,358.688003
6,5803,"Lovely Room 1 in BEST AREA; Legal Rental, Spot...",9744,Laurie,Brooklyn,South Slope,40.668010,-73.987840,Private room,124,...,163,14,,496,25048,4.820282,6.206576,120,103.223430,97.745150
8,7097,Perfect for Your Parents: Privacy + Garden,17571,Jane,Brooklyn,Fort Greene,40.691233,-73.972702,Entire home/apt,220,...,38,36,,660,71940,5.393628,6.492240,220,235.798218,166.921832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41525,772714221060214808,Good Vibes at The Bronx,421264574,Aridio,Bronx,Unionport,40.828074,-73.847886,Entire home/apt,125,...,60,0,,250,38125,4.828314,5.521461,120,163.684932,91.153067
41526,772716724205003579,2 bedroom Condo near West Village,481177884,Steven,Manhattan,Chelsea,40.740030,-73.997302,Entire home/apt,1114,...,355,0,,3342,11140,7.015712,8.114325,1100,349.695143,263.065421
41527,771961589340472067,Habitación Privada,409771624,Camila,Queens,Sunnyside,40.742223,-73.923310,Private room,124,...,360,0,,248,620,4.820282,5.513429,120,91.758048,69.393339
41528,771962449581256963,Romántico y natural,421601513,Juan Carlos,Manhattan,Washington Heights,40.847271,-73.943419,Private room,80,...,358,0,,400,560,4.382027,5.991465,80,300.856827,358.688003


In [35]:
df.loc[:,'is_budget'] = (df['price'] - df['neighborhood_group_room_type_mean']).abs() / df['neighborhood_group_room_type_std'] < 1
df.loc[:,'is_luxury'] = (df['price'] - df['neighborhood_group_room_type_mean']).abs() / df['neighborhood_group_room_type_std'] > 1
df[['is_budget','is_luxury']].mean()

is_budget    0.899401
is_luxury    0.100461
dtype: float64