# Hawaii Airbnb and POI data

This notebook explores the Hawaii Airbnb listings together with point-of-interest (POI) data and visualizes both on interactive maps, including heatmaps. By combining the Airbnb and POI data, we can see which attractions are near particular Airbnb neighborhoods.

In [None]:
# Libraries to download
import pandas as pd
import sys
import scipy
import numpy as np
try:
    import folium
except:
    !{sys.executable} -m pip install folium
    import folium
from folium import plugins
from folium.plugins import MarkerCluster
try:
    import altair as alt
except:
    !{sys.executable} -m pip install altair
    import altair as alt

Getting the Hawaii dataset. This is mostly the "detailed" listings csv from InsideAirbnb data, and gives more details/more listing ids for us to use.

In [2]:
# Read the detailed listings and tag every row with a constant weight of 1
# (used later as the per-listing weight column).
airbnb_data = pd.read_csv('airbnb/hawaii_listings.csv').assign(count=1)
airbnb_data

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,count
0,5065,https://www.airbnb.com/rooms/5065,2.021070e+13,7/9/2021,MAUKA BB,"Perfect for your vacation, Staycation or just ...",Neighbors here are friendly but are not really...,https://a0.muscache.com/pictures/36718112/1f0e...,7257,https://www.airbnb.com/users/show/7257,...,4.48,4.76,,f,1,1,0,0,0.41,1
1,5269,https://www.airbnb.com/rooms/5269,2.021070e+13,7/9/2021,Upcountry Hospitality in the 'Auwai Suite,"The 'Auwai Suite is a lovely, self-contained a...","We are located on the ""sunny side"" of Waimea, ...",https://a0.muscache.com/pictures/5b52b72f-5a09...,7620,https://www.airbnb.com/users/show/7620,...,5.00,4.82,119-269-5808-01R,f,3,3,0,0,0.10,1
2,5387,https://www.airbnb.com/rooms/5387,2.021070e+13,7/9/2021,Hale Koa Studio & 1 Bedroom Units!!,This Wonderful Spacious Studio apt/flat is in ...,IN a Farm belt area with small commercial farm...,https://a0.muscache.com/pictures/1170713/dca6a...,7878,https://www.airbnb.com/users/show/7878,...,4.72,4.74,,t,3,3,0,0,1.66,1
3,5389,https://www.airbnb.com/rooms/5389,2.021070e+13,7/9/2021,Keauhou Villa,It is less than 10 minute walk to the Keauhou ...,It is less than 10 minute walk to the Keauhou ...,https://a0.muscache.com/pictures/15520396/3b89...,7878,https://www.airbnb.com/users/show/7878,...,4.97,4.74,,f,3,3,0,0,0.57,1
4,5390,https://www.airbnb.com/rooms/5390,2.021070e+13,7/9/2021,STAY AT PRINCE KUHIO!,"<b>The space</b><br />Prince Kuhio, Studio Uni...",,https://a0.muscache.com/pictures/12955/af97ac5...,7887,https://www.airbnb.com/users/show/7887,...,4.90,4.67,42652226,f,1,1,0,0,2.07,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21803,50898372,https://www.airbnb.com/rooms/50898372,2.021070e+13,7/9/2021,1 Bedroom Ocean Front Condo in Honokowai - Sle...,"Ocean Front Ecstasy! Homey w/Tile Floor, Full ...",,https://a0.muscache.com/pictures/prohost-api/H...,118578746,https://www.airbnb.com/users/show/118578746,...,,,GE-140-773-5808-01,t,137,137,0,0,,1
21804,50898551,https://www.airbnb.com/rooms/50898551,2.021070e+13,7/9/2021,1 Bedroom Ocean Front Condo in Honokowai - Sle...,This 1-bedroom oceanfront suite includes all t...,,https://a0.muscache.com/pictures/prohost-api/H...,118578746,https://www.airbnb.com/users/show/118578746,...,,,,t,137,137,0,0,,1
21805,50900058,https://www.airbnb.com/rooms/50900058,2.021070e+13,7/9/2021,Hawaii Life Rentals Presents Island Sands 110 ...,"This beautiful, ground-floor honeymoon condo i...",,https://a0.muscache.com/pictures/prohost-api/H...,680107,https://www.airbnb.com/users/show/680107,...,,,,t,137,137,0,0,,1
21806,50900602,https://www.airbnb.com/rooms/50900602,2.021070e+13,7/9/2021,Maui Resort Rentals: Honua Kai – Incredible 6 ...,This stunning offering combines a stunning 3br...,With a prime beachfront location on West Maui'...,https://a0.muscache.com/pictures/prohost-api/H...,39073224,https://www.airbnb.com/users/show/39073224,...,,,175-718-1952-01,t,261,257,4,0,,1


Getting attraction info from POI Factory. These are mostly locations that are recommended for tourists to visit.

The dataset is located at http://www.poi-factory.com/node/21535; we combined the files we found valuable into one master CSV.

In [7]:
# Load the combined POI Factory attractions CSV and display it.
usapoi_data = pd.read_csv('poi_factory/poi_hawaii_data.csv')
usapoi_data

Unnamed: 0,longitude,latitude,name,info
0,-159.492460,21.885750,Allerton National Botanical Garden,
1,-159.475740,22.212730,Hanalei Valley Lookout,
2,-159.562670,21.919870,Hanapepe Lookout,
3,-159.335350,22.042570,Hikinaakala Heiau,
4,-159.645670,22.150340,Kalalau Lookout,
...,...,...,...,...
365,-157.968056,21.321667,Fort Kamehameha Military Reservation (historical),"Pearl Harbor, Honolulu"
366,-157.982222,21.320556,Fort Weaver (historical),"Pearl Harbor, Honolulu"
367,-157.953889,21.371389,Battery Adair (historical),"Pearl Harbor, Honolulu"
368,-157.969167,21.367222,Battery Boyd (historical),"Pearl Harbor, Honolulu"


### Folium

Now that we have the two datasets, we can see that both contain geospatial data with longitude and latitude. Folium is an excellent resource that allows us to visualize this geodata with interactive maps. Thus, for this problem, we create a Folium map that shows the Airbnb listings as a heatmap and the poi_factory data as marker clusters.

In [8]:
# Centre the initial view on the Hawaiian islands (approximate midpoint of the
# chain) so the data is visible without panning away from the mainland US.
map1 = folium.Map(location=[20.8, -157.5], zoom_start=7)

# One heatmap point per listing: latitude, longitude and the constant 'count' weight.
stationArr = airbnb_data[['latitude', 'longitude', 'count']].values


# Heatmap of every listing. Make sure your computer can handle folium charts!
# `add_child` replaces the deprecated `add_children` alias.
map1.add_child(plugins.HeatMap(stationArr, radius=15))

mc = MarkerCluster()

# One marker per attraction; iterating the two coordinate columns directly avoids
# building a Series for every row as iterrows() does.
for poi_lat, poi_lon in zip(usapoi_data['latitude'], usapoi_data['longitude']):
    mc.add_child(folium.Marker(location=[poi_lat, poi_lon]))

map1.add_child(mc)

#Illustrating clusters (count of individual places rows) with heatmap
map1

  import sys


We see that most of the popular tourist sites have a LOT of listings around them. This helps us understand what creates the "interest" for hosts and customers to choose these locations. If we zoom in even closer, a lot of the attractions are located along the edges of the islands, most likely because of the beaches and the tropical view.

Likewise, the map shows that a lot of the hosts are clustered along the edges of the islands. When we view specific areas, we can see that most hosts also choose locations near the coast, or at least away from mountains and trails.

Another great tool we utilized to visualize this data was through kepler. An example is shown in the code folders.

https://kepler.gl/demo

### However, what about distance?


Since we can at least see which places are close to each other, we decided to use the data provided to calculate the distance between each listing and every attraction. This gives us an additional feature for what creates "success" for a hosting location.

Here are some resources that we studied:

https://stackoverflow.com/questions/20303323/distance-calculation-between-rows-in-pandas-dataframe-using-a-distance-matrix
https://stackoverflow.com/questions/56115205/euclidean-distance-between-two-pandas-dataframes
https://kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/

But the one we utilized was:
https://stackoverflow.com/questions/40452759/pandas-latitude-longitude-to-distance-between-successive-rows

In [10]:
# Keep just the coordinates of every listing, re-indexed from 0, and peek at the first one.
only_places = airbnb_data.loc[:, ['longitude', 'latitude']].reset_index(drop=True)
only_places.iloc[0]

longitude   -155.43259
latitude      20.04266
Name: 0, dtype: float64

In [11]:
# Show just the coordinate columns of the attractions table.
usapoi_data[['longitude', 'latitude']]

Unnamed: 0,longitude,latitude
0,-159.492460,21.885750
1,-159.475740,22.212730
2,-159.562670,21.919870
3,-159.335350,22.042570
4,-159.645670,22.150340
...,...,...
365,-157.968056,21.321667
366,-157.982222,21.320556
367,-157.953889,21.371389
368,-157.969167,21.367222


In [3]:
# Function to get a "distance number" between one location to another based on long/lat
def haversine_vectorize(lon1, lat1, lon2, lat2):
    """Haversine distance in kilometres between two points.

    Coordinates are given in degrees and converted to radians internally.
    Every step is a NumPy operation, so each argument may be a scalar or an
    array-like.

    Parameters
    ----------
    lon1, lat1 : longitude / latitude of the first point(s), in degrees.
    lon2, lat2 : longitude / latitude of the second point(s), in degrees.

    Returns
    -------
    Distance in kilometres, using an Earth radius of 6367 km.
    """
    lon1, lat1, lon2, lat2 = (np.radians(angle) for angle in (lon1, lat1, lon2, lat2))

    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    haversine_term = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2

    central_angle = 2 * np.arcsin(np.sqrt(haversine_term))
    # 6367 for distance in KM; for miles use 3958
    return 6367 * central_angle

Once we got the resources from both datasets, we tried testing with just a single hosting to the whole attractions table.

In [18]:
# Distances (km) from the first listing to every attraction.
# haversine_vectorize is built from NumPy operations, so passing the whole
# longitude/latitude columns computes all distances in one call instead of
# slicing the attractions frame once per row.
first_listing = only_places.iloc[0]
test_list = haversine_vectorize(
    first_listing['longitude'], first_listing['latitude'],
    usapoi_data['longitude'].to_numpy(), usapoi_data['latitude'].to_numpy(),
).tolist()

In [19]:
# Display the distances from the first listing to each attraction.
test_list

[468.4031181562836,
 483.4796115116867,
 476.567460568302,
 461.7356435239378,
 495.58933376490484,
 475.5134834796393,
 461.015989269884,
 477.81874757050747,
 492.89013248650366,
 460.03269840130014,
 464.0181255333325,
 495.7023708657661,
 494.09557221667274,
 462.3290461468105,
 490.8273840272582,
 469.072099790683,
 465.2991661943444,
 463.6793507985997,
 493.13611132246024,
 468.52614172261895,
 294.73515879045954,
 286.86667698798215,
 310.94764115262507,
 291.0929732699078,
 292.8176082808279,
 281.3507226229362,
 281.5299736952639,
 316.9184979739205,
 310.89914876195195,
 271.036227827739,
 272.2388394278195,
 289.1710595133221,
 284.4494991785337,
 288.6100177883399,
 291.7833427548787,
 282.9452816497701,
 270.9857868576424,
 298.98944125452203,
 298.4600093365735,
 284.8734538764463,
 281.8629826191515,
 286.09085593713644,
 313.3397531353473,
 287.8649887591773,
 271.4863418004957,
 285.8060652051469,
 324.141285398414,
 299.13400870391683,
 299.1569341763447,
 299.156934

In [20]:
# Count the attractions whose distance from the first listing is below 100.
len([a for a in test_list if a < 100])

30

Basically, that one listing has 30 attractions relatively nearby! The function returns distances in kilometers (a value of 1 is about 1 km), so our logic was to count any attractions within a 100 km radius.


Now that we have confirmed that it works, we have to do it for all the listings! Because the process would take several hours to run, we separated the Airbnb data into three CSVs and ran each of them through the function we created. After all the distances are produced, we combine them into the master CSV.

In [21]:
# Run through and receive the closest attractions within each listing
def get_number_of_close_attractions(area_series, max_distance_km=100):
    """Count the attractions in ``usapoi_data`` that lie close to one listing.

    Parameters
    ----------
    area_series : mapping/Series with 'longitude' and 'latitude' entries (degrees)
        for a single listing.
    max_distance_km : distance threshold in kilometres; attractions strictly
        closer than this are counted. Defaults to 100, the original fixed value.

    Returns
    -------
    int
        Number of attractions whose haversine distance to the listing is below
        ``max_distance_km``.
    """
    # haversine_vectorize is built from NumPy operations, so the listing's scalar
    # coordinates are compared with every attraction in one call rather than
    # re-slicing the attractions frame for each row.
    distances_km = haversine_vectorize(
        area_series['longitude'], area_series['latitude'],
        usapoi_data['longitude'].to_numpy(), usapoi_data['latitude'].to_numpy(),
    )

    # NaN distances compare False, so they are not counted (same as before).
    return int(np.count_nonzero(distances_km < max_distance_km))

In [22]:
import tqdm

In [23]:
# Read the three listing splits, then keep only each split's coordinates
# re-indexed from 0.
listing_1, listing_2, listing_3 = (
    pd.read_csv(csv_path)
    for csv_path in ('airbnb/listings_1.csv', 'airbnb/listings_2.csv', 'airbnb/listings_3.csv')
)

only_places1, only_places2, only_places3 = (
    split[['longitude', 'latitude']].reset_index(drop=True)
    for split in (listing_1, listing_2, listing_3)
)

The following line runs the function, and would take around 1 hour to run one listing file. 

In [29]:
# test_list3 = []

# for num in tqdm.tqdm(range(0, len(only_places3))):
#     test_list3.append(get_number_of_close_attractions(only_places3.loc[num]))

100%|██████████████████████████████████████████████████████████████████████████████| 7137/7137 [53:57<00:00,  2.20it/s]


In [30]:
# len(test_list3)

7137

We write them into csvs then combine them.

In [None]:
# only_places1['closeness_attactions'] = test_list1
# only_places1.to_csv('test1_closeness.csv')

In [28]:
# only_places2['closeness_attactions'] = test_list2
# only_places2.to_csv('test2_closeness.csv')

In [31]:
# only_places3['closeness_attactions'] = test_list3
# only_places3.to_csv('test3_closeness.csv')

But why did we compute this distance feature? Well, we wanted to define success for these locations, so having this as a feature lets us continue to our next step with models.

## Archive

In [1]:
# august_df = pd.read_csv('airbnb/august_calendar.csv')
# august_df

In [2]:
# listing_with_nights = august_df.groupby(['listing_id']).sum().reset_index(drop=False)[['listing_id', 'maximum_nights']]
# listing_with_nights

In [6]:
# merged_data = airbnb_data.merge(listing_with_nights, left_on='id', right_on='listing_id', how='right')
# merged_data.drop(columns='listing_id', inplace=True)
# merged_data

In [9]:
# august_cal = pd.read_csv('airbnb/august_calendar.csv')
# august_cal['date'] = pd.to_datetime(august_cal['date']).dt.strftime('%Y/%m/%d %s')
# august_cal

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,87671,2021-08-06 00:00:00,t,$227.00,$227.00,7.0,1125.0
1,87671,2021-08-07 00:00:00,t,$227.00,$227.00,7.0,1125.0
2,87671,2021-08-08 00:00:00,t,$227.00,$227.00,7.0,1125.0
3,87671,2021-08-09 00:00:00,t,$227.00,$227.00,7.0,1125.0
4,87671,2021-08-17 00:00:00,t,$227.00,$227.00,7.0,1125.0
...,...,...,...,...,...,...,...
207792,50897989,2021-08-27 00:00:00,t,$139.00,$139.00,1.0,365.0
207793,50897989,2021-08-28 00:00:00,t,$139.00,$139.00,1.0,365.0
207794,50897989,2021-08-29 00:00:00,t,$139.00,$139.00,1.0,365.0
207795,50897989,2021-08-30 00:00:00,t,$129.00,$129.00,1.0,365.0


In [12]:
# from scipy.spatial import distance

# np.array(only_places.loc[0]).reshape(-1, 1)

array([[-155.43259],
       [  20.04266]])

In [13]:
# np.array(usapoi_data[['longitude', 'latitude']].loc[0]).reshape(-1, 1)

array([[-159.49246],
       [  21.88575]])

In [14]:
# distance.cdist(np.array(only_places.loc[0]).reshape(-1, 1), 
#                np.array(usapoi_data[['longitude', 'latitude']].loc[0]).reshape(-1, 1), metric='euclidean')

array([[  4.05987, 177.31834],
       [179.53512,   1.84309]])

In [15]:
# from sklearn.metrics.pairwise import euclidean_distances

# euclidean_distances(np.array(only_places.loc[0]).reshape(-1, 1), 
#                np.array(usapoi_data[['longitude', 'latitude']].loc[0]).reshape(-1, 1))

array([[  4.05987, 177.31834],
       [179.53512,   1.84309]])