# Creating the San Diego (SD) Dataset

#### From InsideAirbnb and POI-Factory, information about Airbnb listings, booking dates, and local attractions will be synthesized into a single dataset that can later be used for modeling and visualizations.  The data will be filtered, joined, and transformed primarily via the Pandas and GeoPandas python libraries.

## Requirements and imports 

In [1]:
! pip install -r requirements.txt



In [2]:
import pandas as pd
import geopandas as gpd

## Read in the Listings, Neighbourhoods, and Booking Calendar Data

In [3]:
SD_cal = pd.read_csv('./San_Diego/calendar.csv.gz', compression='gzip',)
SD_lis = pd.read_csv('./San_Diego/listings.csv')
SD_nei = pd.read_csv('./San_Diego/neighbourhoods.csv').fillna(0)

display(SD_cal.head(5))
display(SD_lis.head(5))
display(SD_nei.head(5))


Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,585426,2021-07-15,f,$265.00,$265.00,3.0,1125.0
1,1072842,2021-07-15,f,$140.00,$140.00,2.0,1125.0
2,1072842,2021-07-16,f,$175.00,$175.00,2.0,1125.0
3,1072842,2021-07-17,f,$175.00,$175.00,2.0,1125.0
4,1072842,2021-07-18,f,$140.00,$140.00,2.0,1125.0


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,5570,Ocean front condo on the sand,8435,Jef Karchin,,Mission Bay,32.78481,-117.25313,Entire home/apt,2050,3,0,,,3,165
1,29967,"Great home, 10 min walk to Beach",129123,Michael,,Pacific Beach,32.80751,-117.2576,Entire home/apt,242,4,66,2021-06-30,0.49,5,209
2,38245,Point Loma: Den downstairs,164137,Melinda,,Roseville,32.74217,-117.21931,Private room,74,1,143,2019-10-20,1.46,3,325
3,54001,"La Jolla Cottage Blocks to Ocn; 2Bdms, 1Bth.",252692,Marsha,,La Jolla,32.81301,-117.26856,Entire home/apt,167,5,264,2021-07-11,5.59,2,62
4,62274,"charming, colorful, close to beach",302986,Isabel,,Pacific Beach,32.80583,-117.24244,Entire home/apt,83,1,659,2021-06-27,6.15,2,288


Unnamed: 0,walk_score,transit_score,bike_score,neighbourhood
0,42.0,28.0,30.0,Allied Gardens
1,49.0,58.0,69.0,Alta Vista
2,0.0,0.0,0.0,Amphitheater And Water Park
3,21.0,46.0,30.0,Balboa Park
4,0.0,0.0,0.0,Bario Logan


## Clean the Dataset and Add Relevant Features 

In [4]:
def my_szn(date):
    mon = date.month
    day = date.day
    
    if mon < 3 or mon == 3 and day <=20:
        szn='Winter'
    elif mon < 6 or mon == 6 and day <=20:
        szn = 'Spring'
    elif mon < 9 or mon == 9 and day <=20:
        szn = 'Summer'
    elif mon < 12 or mon == 12 and day <=20:
        szn = 'Fall'
    else:
        szn='Winter'
    
    return szn


In [5]:
###############################################################################################################################
# Add in Seasonality markers such as Month, Season, Day of Week, etc. 
###############################################################################################################################
SD_cal['date'] = pd.to_datetime(SD_cal['date'], infer_datetime_format=True)
SD_cal['month'] = SD_cal['date'].dt.month
SD_cal['qtr'] = SD_cal['date'].dt.quarter
SD_cal['is_weekend'] = SD_cal['date'].dt.weekday >= 4
SD_cal['dayofweek'] = SD_cal['date'].dt.dayofweek
SD_cal['szn'] = SD_cal.apply(lambda x: my_szn(x['date']), axis=1 )

In [6]:
###############################################################################################################################
# Pivot the Calendar data so that each listing appears on only 1 row (primary key==listing_id)
###############################################################################################################################

SD_cal_piv = SD_cal.groupby(['listing_id','szn','is_weekend','available'])\
            .agg({'price':'count'}).reset_index()\
            .pivot(['listing_id'],['szn','is_weekend','available'],['price'])\
            .reset_index().fillna(0)

SD_cal_piv.columns = SD_cal_piv.columns.to_flat_index()
SD_cal_piv.columns = ['listing_id',
 'Fall_weekday_f', 'Fall_weekday_t', 'Fall_weekend_f', 'Fall_weekend_t',
 'Spring_weekday_f', 'Spring_weekday_t', 'Spring_weekend_f', 'Spring_weekend_t',
 'Summer_weekday_f', 'Summer_weekday_t', 'Summer_weekend_f', 'Summer_weekend_t',
 'Winter_weekday_f', 'Winter_weekday_t', 'Winter_weekend_f', 'Winter_weekend_t']

###############################################################################################################################
# Convert Binary indicator of Booked (cols ending in _t or _f) to a single to a vacancy metric based on Season and Day of Week
###############################################################################################################################

SD_cal_piv['Fall_weekday'] = SD_cal_piv['Fall_weekday_t']/(SD_cal_piv['Fall_weekday_t']+SD_cal_piv['Fall_weekday_f'])
SD_cal_piv['Fall_weekend'] = SD_cal_piv['Fall_weekend_t']/(SD_cal_piv['Fall_weekend_t']+SD_cal_piv['Fall_weekend_f'])

SD_cal_piv['Spring_weekday'] = SD_cal_piv['Spring_weekday_t']/(SD_cal_piv['Spring_weekday_t']+SD_cal_piv['Spring_weekday_f'])
SD_cal_piv['Spring_weekend'] = SD_cal_piv['Spring_weekend_t']/(SD_cal_piv['Spring_weekend_t']+SD_cal_piv['Spring_weekend_f'])

SD_cal_piv['Summer_weekday'] = SD_cal_piv['Summer_weekday_t']/(SD_cal_piv['Summer_weekday_t']+SD_cal_piv['Summer_weekday_f'])
SD_cal_piv['Summer_weekend'] = SD_cal_piv['Summer_weekend_t']/(SD_cal_piv['Summer_weekend_t']+SD_cal_piv['Summer_weekend_f'])

SD_cal_piv['Winter_weekday'] = SD_cal_piv['Winter_weekday_t']/(SD_cal_piv['Winter_weekday_t']+SD_cal_piv['Winter_weekday_f'])
SD_cal_piv['Winter_weekend'] = SD_cal_piv['Winter_weekend_t']/(SD_cal_piv['Winter_weekend_t']+SD_cal_piv['Winter_weekend_f'])

###############################################################################################################################
# Convert the Binary indicators to a vacancy metric for Season only
###############################################################################################################################

SD_cal_piv['Fall'] = SD_cal_piv[[ 'Fall_weekday_t', 'Fall_weekend_t',]].sum(axis=1)/(
                        SD_cal_piv[[ 'Fall_weekday_f', 'Fall_weekday_t', 'Fall_weekend_f', 'Fall_weekend_t',]].sum(axis=1))
SD_cal_piv['Spring'] = SD_cal_piv[[ 'Spring_weekday_t', 'Spring_weekend_t',]].sum(axis=1)/(
                        SD_cal_piv[[ 'Spring_weekday_f', 'Spring_weekday_t', 'Spring_weekend_f', 'Spring_weekend_t',]].sum(axis=1))
SD_cal_piv['Summer'] = SD_cal_piv[[ 'Summer_weekday_t', 'Summer_weekend_t',]].sum(axis=1)/(
                        SD_cal_piv[[ 'Summer_weekday_f', 'Summer_weekday_t', 'Summer_weekend_f', 'Summer_weekend_t',]].sum(axis=1))
SD_cal_piv['Winter'] = SD_cal_piv[[ 'Winter_weekday_t', 'Winter_weekend_t',]].sum(axis=1)/(
                        SD_cal_piv[[ 'Winter_weekday_f', 'Winter_weekday_t', 'Winter_weekend_f', 'Winter_weekend_t']].sum(axis=1))

###############################################################################################################################
# Convert the Binary indicators to a vacancy metric for Weekday vs Weekend only
###############################################################################################################################
SD_cal_piv['Weekday'] = SD_cal_piv[['Fall_weekday_t','Spring_weekday_t','Summer_weekday_t','Winter_weekday_t',]].sum(axis=1)/(
                        SD_cal_piv[['Fall_weekday_t','Spring_weekday_t','Summer_weekday_t','Winter_weekday_t',]].sum(axis=1)+
                        SD_cal_piv[['Fall_weekday_f','Spring_weekday_f','Summer_weekday_f','Winter_weekday_f',]].sum(axis=1))

SD_cal_piv['Weekend'] = SD_cal_piv[['Fall_weekend_t','Spring_weekend_t','Summer_weekend_t','Winter_weekend_t',]].sum(axis=1)/(
                        SD_cal_piv[['Fall_weekend_t','Spring_weekend_t','Summer_weekend_t','Winter_weekend_t',]].sum(axis=1)+
                        SD_cal_piv[['Fall_weekend_f','Spring_weekend_f','Summer_weekend_f','Winter_weekend_f',]].sum(axis=1))


###############################################################################################################################
# Create a grand total Vacancy metric
###############################################################################################################################
SD_cal_piv['NoVacancy']=SD_cal_piv[['Fall_weekday_t','Spring_weekday_t','Summer_weekday_t','Winter_weekday_t',
                                    'Fall_weekend_t','Spring_weekend_t','Summer_weekend_t','Winter_weekend_t',]].sum(axis=1)/(
                        SD_cal_piv[['Fall_weekday_t','Spring_weekday_t','Summer_weekday_t','Winter_weekday_t',
                                    'Fall_weekend_t','Spring_weekend_t','Summer_weekend_t','Winter_weekend_t',]].sum(axis=1)+
                        SD_cal_piv[['Fall_weekday_f','Spring_weekday_f','Summer_weekday_f','Winter_weekday_f',
                                    'Fall_weekend_f','Spring_weekend_f','Summer_weekend_f','Winter_weekend_f',]].sum(axis=1))

SD_cal_piv['Occupancy']=SD_cal_piv[['Fall_weekday_t','Spring_weekday_t','Summer_weekday_t','Winter_weekday_t',
                                    'Fall_weekend_t','Spring_weekend_t','Summer_weekend_t','Winter_weekend_t',]].sum(axis=1)


###############################################################################################################################
# Trim down columns
###############################################################################################################################
SD_cal_piv = SD_cal_piv[['listing_id','Occupancy','Fall_weekday','Fall_weekend','Spring_weekday','Spring_weekend',
                         'Summer_weekday','Summer_weekend','Winter_weekday','Winter_weekend',
                         'Fall','Spring','Summer','Winter',
                         'Weekday','Weekend','NoVacancy']]

SD_cal_piv

Unnamed: 0,listing_id,Occupancy,Fall_weekday,Fall_weekend,Spring_weekday,Spring_weekend,Summer_weekday,Summer_weekend,Winter_weekday,Winter_weekend,Fall,Spring,Summer,Winter,Weekday,Weekend,NoVacancy
0,5570,165.0,0.403846,0.358974,0.924528,0.923077,0.207547,0.179487,0.294118,0.307692,0.384615,0.923913,0.195652,0.300000,0.459330,0.442308,0.452055
1,29967,209.0,0.096154,0.128205,1.000000,1.000000,0.264151,0.256410,0.921569,0.923077,0.109890,1.000000,0.260870,0.922222,0.569378,0.576923,0.572603
2,38245,325.0,1.000000,1.000000,1.000000,1.000000,0.603774,0.512821,1.000000,1.000000,1.000000,1.000000,0.565217,1.000000,0.899522,0.878205,0.890411
3,54001,62.0,0.057692,0.076923,0.000000,0.000000,0.037736,0.025641,0.568627,0.615385,0.065934,0.000000,0.032609,0.588889,0.162679,0.179487,0.169863
4,62274,288.0,0.903846,0.743590,1.000000,1.000000,0.433962,0.256410,0.960784,0.974359,0.835165,1.000000,0.358696,0.966667,0.822967,0.743590,0.789041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9097,51025904,77.0,0.230769,0.230769,0.000000,0.000000,0.641509,0.564103,0.000000,0.000000,0.230769,0.000000,0.608696,0.000000,0.220096,0.198718,0.210959
9098,51027262,11.0,0.000000,0.000000,0.000000,0.000000,0.094340,0.153846,0.000000,0.000000,0.000000,0.000000,0.119565,0.000000,0.023923,0.038462,0.030137
9099,51033701,358.0,1.000000,1.000000,1.000000,1.000000,0.924528,0.923077,1.000000,1.000000,1.000000,1.000000,0.923913,1.000000,0.980861,0.980769,0.980822
9100,51034402,365.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000


In [7]:
###############################################################################################################################
# Join the Calendar data to the Listing data and the Neighbourhood data
###############################################################################################################################
SD_df = SD_cal_piv.merge(SD_lis, left_on='listing_id', right_on='id', how='inner', suffixes=('','_lis'))
SD_df = SD_df.merge(SD_nei, on=['neighbourhood'], how='inner', suffixes=('','_nei')).drop_duplicates()
print(SD_df.columns, SD_df.shape)
SD_df.head(5)


Index(['listing_id', 'Occupancy', 'Fall_weekday', 'Fall_weekend',
       'Spring_weekday', 'Spring_weekend', 'Summer_weekday', 'Summer_weekend',
       'Winter_weekday', 'Winter_weekend', 'Fall', 'Spring', 'Summer',
       'Winter', 'Weekday', 'Weekend', 'NoVacancy', 'id', 'name', 'host_id',
       'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude',
       'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365', 'walk_score',
       'transit_score', 'bike_score'],
      dtype='object') (9102, 36)


Unnamed: 0,listing_id,Occupancy,Fall_weekday,Fall_weekend,Spring_weekday,Spring_weekend,Summer_weekday,Summer_weekend,Winter_weekday,Winter_weekend,...,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,walk_score,transit_score,bike_score
0,5570,165.0,0.403846,0.358974,0.924528,0.923077,0.207547,0.179487,0.294118,0.307692,...,2050,3,0,,,3,165,0.0,0.0,0.0
1,182428,339.0,1.0,1.0,1.0,1.0,0.735849,0.692308,1.0,1.0,...,210,3,6,2020-05-31,0.07,30,339,0.0,0.0,0.0
2,211583,135.0,0.538462,0.512821,0.226415,0.205128,0.037736,0.025641,0.745098,0.666667,...,280,3,178,2021-06-29,3.65,1,135,0.0,0.0,0.0
3,519849,181.0,0.865385,0.923077,0.207547,0.230769,0.509434,0.435897,0.411765,0.384615,...,200,4,124,2021-06-28,2.59,1,181,0.0,0.0,0.0
4,560227,209.0,0.942308,0.923077,0.867925,0.923077,0.45283,0.461538,0.0,0.0,...,429,30,2,2012-11-24,0.02,2,209,0.0,0.0,0.0


## Read in the POI Factory Data 

In [8]:
SD_poi_base = pd.read_csv("./San_Diego/SanDiegoPOI.csv",header=None)[[0,1,2]]
SD_poi_golf = pd.read_csv("./San_Diego/San Diego Golf Courses.csv",header=None)[[0,1,2]]
SD_poi_trol = pd.read_csv("./San_Diego/San Diego Trolley Stops.csv",header=None)[[0,1,2]]
SD_poi_bank = pd.read_csv("./San_Diego/San Diego County Credit Union.csv",header=None)[[0,1,2]]
SD_poi_kite = pd.read_csv("./San_Diego/KiteSDCA.csv",header=None)[[0,1,2]]

SD_poi = pd.concat([SD_poi_base, SD_poi_golf, SD_poi_trol,SD_poi_bank, SD_poi_kite])

SD_poi.columns = ['POI_LNG','POI_LAT','POI_name']
SD_poi

Unnamed: 0,POI_LNG,POI_LAT,POI_name
0,-117.195856,32.753347,Old Town State Park
1,-117.197750,32.731489,Airport (SAN) - Terminal 1
2,-117.200980,32.731566,Airport (SAN) - Terminal 2
3,-117.196815,32.725154,Harbor Island
4,-117.223222,32.717944,Shelter Island
...,...,...,...
1,-117.138740,32.625740,Silver Strand Beach
2,-117.132500,32.576000,Imperial Beach
3,-117.262600,32.804900,Tourmaline Beach
4,-117.216000,32.774100,Stinkies


## Convert Pandas to GeoPandas, Project the Data to the Correct EPSG Codes, Join Datasets

In [9]:
###############################################################################################################################
# The EPSG code used for California is EPSG:3310 which is the California Albers.  For more info, see blog post.
###############################################################################################################################
 
SD_poi_g = gpd.GeoDataFrame(SD_poi, geometry=gpd.points_from_xy(SD_poi.POI_LNG, SD_poi.POI_LAT), crs="EPSG:4326")
SD_poi_g = SD_poi_g.to_crs("EPSG:3310")
SD_poi_g.crs

<Projected CRS: EPSG:3310>
Name: NAD83 / California Albers
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: United States (USA) - California.
- bounds: (-124.45, 32.53, -114.12, 42.01)
Coordinate Operation:
- name: California Albers
- method: Albers Equal Area
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [10]:
###############################################################################################################################
# In order to join in the next cell, both EPSG projections must be the same.
###############################################################################################################################
SD_df_g = gpd.GeoDataFrame(SD_df, geometry=gpd.points_from_xy(SD_df.longitude, SD_df.latitude), crs="EPSG:4326")
SD_df_g = SD_df_g.to_crs("EPSG:3310")
SD_df_g.crs

<Projected CRS: EPSG:3310>
Name: NAD83 / California Albers
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: United States (USA) - California.
- bounds: (-124.45, 32.53, -114.12, 42.01)
Coordinate Operation:
- name: California Albers
- method: Albers Equal Area
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [11]:
###############################################################################################################################
# Add a 1600 meter Buffer around each Listing point (1600m = .99 miles). 
# Do a Spatial Within Join to match any POI point within the 1 mile buffer around each Listing point.
###############################################################################################################################
 
SD_df_g['geometry'] = SD_df_g['geometry'].buffer(1600)
SD_f = gpd.sjoin(SD_poi_g, SD_df_g, how='right', op='within').sort_values(by=['id','POI_name'])
SD_f


Unnamed: 0,index_left,POI_LNG,POI_LAT,POI_name,listing_id,Occupancy,Fall_weekday,Fall_weekend,Spring_weekday,Spring_weekend,...,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,walk_score,transit_score,bike_score,geometry
0,86.0,-117.256690,32.796314,Crystal Pier in Pacific Beach,5570,165.0,0.403846,0.358974,0.924528,0.923077,...,3,0,,,3,165,0.0,0.0,0.0,"POLYGON ((259235.365 -577126.801, 259227.660 -..."
0,12.0,-117.250950,32.772034,Mission Beach area of San Diego,5570,165.0,0.403846,0.358974,0.924528,0.923077,...,3,0,,,3,165,0.0,0.0,0.0,"POLYGON ((259235.365 -577126.801, 259227.660 -..."
0,11.0,-117.256748,32.796590,Pacific Beach area of San Diego,5570,165.0,0.403846,0.358974,0.924528,0.923077,...,3,0,,,3,165,0.0,0.0,0.0,"POLYGON ((259235.365 -577126.801, 259227.660 -..."
0,13.0,-117.251575,32.771701,Roller Coaster at Mission Beach,5570,165.0,0.403846,0.358974,0.924528,0.923077,...,3,0,,,3,165,0.0,0.0,0.0,"POLYGON ((259235.365 -577126.801, 259227.660 -..."
0,5.0,-117.240100,32.784900,Sail Bay,5570,165.0,0.403846,0.358974,0.924528,0.923077,...,3,0,,,3,165,0.0,0.0,0.0,"POLYGON ((259235.365 -577126.801, 259227.660 -..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3619,24.0,-117.157125,32.706452,Petco Park (Home of the Padres),51036524,148.0,1.000000,1.000000,0.000000,0.000000,...,31,0,,,7,148,0.0,0.0,0.0,"POLYGON ((268786.280 -585309.988, 268778.575 -..."
3619,23.0,-117.160603,32.706831,San Diego Convention Center,51036524,148.0,1.000000,1.000000,0.000000,0.000000,...,31,0,,,7,148,0.0,0.0,0.0,"POLYGON ((268786.280 -585309.988, 268778.575 -..."
3619,120.0,-117.144108,32.699281,San Diego to Coronado Bridge,51036524,148.0,1.000000,1.000000,0.000000,0.000000,...,31,0,,,7,148,0.0,0.0,0.0,"POLYGON ((268786.280 -585309.988, 268778.575 -..."
3619,83.0,-117.169491,32.709135,Sea Port Village,51036524,148.0,1.000000,1.000000,0.000000,0.000000,...,31,0,,,7,148,0.0,0.0,0.0,"POLYGON ((268786.280 -585309.988, 268778.575 -..."


In [12]:
###############################################################################################################################
# Some code brought in from outside, sourced by nkmartin
###############################################################################################################################
###############################################################################################################################
# Haversine Distance calculations are very computationally expesive and usually require a full cross join
# This project uses it only for validation purposes of the spatial join used above
###############################################################################################################################

import math
def caluclate_distance_haversine(store_a_longitude, store_a_latitude, store_b_longitude, store_b_latitude):

    # Calculate Latitude and Longitude Difference
    latitude_difference = (store_b_latitude - store_a_latitude) * math.pi / 180.0
    longitude_difference = (store_b_longitude - store_a_longitude) * math.pi / 180.0
  
    # Convert decimal degrees to radians 
    store_a_latitude = (store_a_latitude) * math.pi / 180.0
    store_b_latitude = (store_b_latitude) * math.pi / 180.0
  
    # Compute Haversine Formula
    a = (pow(math.sin(latitude_difference / 2), 2) + 
         pow(math.sin(longitude_difference / 2), 2) * 
             math.cos(store_a_latitude) * math.cos(store_b_latitude)); 
    rad = 6371.0
    c = 2 * math.asin(math.sqrt(a)) 
    return rad * c 

## Verify with a simple Visualization 

In [13]:
###############################################################################################################################
# Take a small subset of the data for graphing purposes
###############################################################################################################################
fol_tmp = SD_f[['POI_LAT','POI_LNG','POI_name','id','latitude','longitude']]
fol_tmp = fol_tmp[fol_tmp['id'].isin([146886,67520])]
fol_tmp['hav_dist_km'] = fol_tmp.apply(lambda x: 
                                    caluclate_distance_haversine(float(x.POI_LAT), float(x.POI_LNG), float(x.latitude),float(x.longitude))
                                    , axis=1)
fol_tmp['hav_dist_mi'] = fol_tmp['hav_dist_km'] * 0.621371
fol_tmp = fol_tmp[(fol_tmp['hav_dist_mi']<=1)]
fol_tmp.sort_values('id').dropna()

Unnamed: 0,POI_LAT,POI_LNG,POI_name,id,latitude,longitude,hav_dist_km,hav_dist_mi
3080,32.705513,-117.153099,12th & Imperial Transit Center,67520,32.7113,-117.15166,0.334424,0.207801
3080,32.699281,-117.144108,San Diego to Coronado Bridge,67520,32.7113,-117.15166,1.037803,0.644861
3080,32.706831,-117.160603,San Diego Convention Center,67520,32.7113,-117.15166,1.019954,0.63377
3080,32.706452,-117.157125,Petco Park (Home of the Padres),67520,32.7113,-117.15166,0.655595,0.407368
3080,32.710922,-117.153809,Park & Market,67520,32.7113,-117.15166,0.239727,0.148959
3080,32.711341,-117.160148,Gaslamp District,67520,32.7113,-117.15166,0.943825,0.586465
3080,32.716743,-117.159683,Fifth Avenue,67520,32.7113,-117.15166,0.933905,0.580301
3080,32.707012,-117.160072,Gaslamp Quarter,67520,32.7113,-117.15166,0.960353,0.596736
3080,32.716724,-117.162834,Civic Center,67520,32.7113,-117.15166,1.272623,0.790771
3080,32.716187,-117.15406,City College,67520,32.7113,-117.15166,0.364307,0.22637


In [14]:
###############################################################################################################################
# Plot first the POI points, then the 1 mile buffer, then the airbnb listing
###############################################################################################################################
import folium
mapit=None
mapit = folium.Map(location=[32.71130,-117.15166], zoom_start=14)

tile = folium.TileLayer(
        tiles = 'https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}',
        attr = 'Esri',
        name = 'Esri Satellite',
        overlay = False,
        control = True
       ).add_to(mapit)

small = 250 

print("Blue area is the 1 mile radius around each Listing")
fol_tmp.head(small).drop_duplicates().apply(lambda row:folium.Circle(location=[row["latitude"], row["longitude"]], 
                                              radius=1600, fill_color='blue', fill_opacity=0.01)
                                             .add_to(mapit), axis=1)
print("Red points are the Listings")
fol_tmp.head(small).drop_duplicates().apply(lambda row:folium.Circle(location=[row["latitude"], row["longitude"]], 
                                              radius=20, fill_color='red', color='red', fill_opacity=1)
                                             .add_to(mapit), axis=1)
print("Orange points are the POI")
fol_tmp.head(small).drop_duplicates().apply(lambda row:folium.Circle(location=[row["POI_LAT"], row["POI_LNG"]], 
                                              radius=10, fill_color='orange', color='orange',fill_opacity=1)
                                             .add_to(mapit) if row["POI_LAT"] else 0, axis=1)

Blue area is the 1 mile radius around each Listing
Red points are the Listings
Orange points are the POI


3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
3080    <folium.vector_layers.Circle object at 0x7f153...
4618    <folium.vector_layers.Circle object at 0x7f153...
4618    <folium.vector_layers.Circle object at 0x7f153...
4618    <folium.vector_layers.Circle object at 0x7f153...
4618    <foliu

In [15]:
mapit

## Run a GB to Count the POI by Listing

In [16]:
SD_f.groupby(['listing_id','neighbourhood','room_type','Occupancy','NoVacancy'])\
        .agg({'POI_name':pd.Series.nunique}).reset_index()\
        .sort_values(by=['POI_name'], ascending=False).rename(columns={'POI_name':'Num_POI_within_1mile'})

Unnamed: 0,listing_id,neighbourhood,room_type,Occupancy,NoVacancy,Num_POI_within_1mile
6621,44265022,Core,Entire home/apt,85.0,0.232877,22
1074,9154346,Gaslamp Quarter,Entire home/apt,234.0,0.641096,22
3058,22421133,East Village,Entire home/apt,32.0,0.087671,22
2170,17515996,Marina,Entire home/apt,180.0,0.493151,22
8236,49657581,Marina,Entire home/apt,114.0,0.312329,22
...,...,...,...,...,...,...
3393,24585480,Normal Heights,Private room,364.0,0.997260,0
1545,13301630,Lynwood Hills,Private room,195.0,0.534247,0
8101,49371592,Southwest,Entire home/apt,116.0,0.317808,0
8100,49369290,Normal Heights,Entire home/apt,47.0,0.128767,0


## Save as Tall table with each row representing a POI-Listing Match

In [17]:
SD_f.to_csv("./SanDiego_v1.csv.gz", compression='gzip')