In [52]:
import requests
import pandas as pd
import h3
import json

- In this notebook we will first collect point-of-interest (POI) data.
- In the second step we will count the points of interest per category for certain areas and for each trip.


# Points of interest data collection

We will use [OpenStreetMap (OSM)](https://en.wikipedia.org/wiki/OpenStreetMap) to collect data about different points of interest. OpenStreetMap is a collaborative, free-of-charge project that provides geographical data for the whole world. In general, the data provided by OpenStreetMap is considered to be fairly good and is often compared to that of Google Maps.

OpenStreetMap divides the area considered in our research into three cities: Los Angeles, Santa Monica and Burbank. Hence, in the following we will collect data for all three of them using the Overpass API, which was created to retrieve custom selections of OSM data for chosen parts of the world. The queries that we wrote for the Overpass API can be found under /utils/overpass_meta.py. They are based on the various map features documented in the [wiki](https://wiki.openstreetmap.org/wiki/Map_features#Public_Transport). Queries can also be tested using a tool called [overpass turbo](https://overpass-turbo.eu/), which directly visualises the results.

In [53]:
import sys, os
# Add `../utils` (resolved against the current working directory) to the module
# search path so that `overpass_meta` can be imported below.
sys.path.append(os.path.abspath(os.path.join('..', 'utils')))
from overpass_meta import get_meta_data

# Per-category metadata; the cells below read its "filepath" and "query" entries.
overpass_meta = get_meta_data()

In [54]:
def poi_file_exists(category):
    """Return True if the cached POI file for `category` is present on disk.

    The location is taken from the ``"filepath"`` entry of ``overpass_meta[category]``.
    """
    cache_path = overpass_meta[category]["filepath"]
    return os.path.isfile(cache_path)


def read_poi_file(category):
    """Load and return the JSON content of the cached POI file for `category`."""
    cache_path = overpass_meta[category]["filepath"]
    with open(cache_path) as cache_file:
        return json.load(cache_file)


def save_poi_file(category, data):
    """Serialise `data` as JSON into the cache file configured for `category`.

    The file is opened in write mode, so existing content at that path is replaced.
    """
    cache_path = overpass_meta[category]["filepath"]
    with open(cache_path, "w") as cache_file:
        cache_file.write(json.dumps(data))


def fetch_poi_data(category, timeout=300):
    """Download the POI data for `category` from the Overpass API and cache it.

    Parameters
    ----------
    category : str
        Key into ``overpass_meta``; its ``"query"`` entry is sent to the API.
    timeout : float or tuple, optional
        Seconds passed to ``requests.get`` as ``timeout``. ``requests`` applies no
        timeout by default, so without it a stalled connection would block the
        notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within `timeout`.
    """
    response = requests.get(
        "http://overpass-api.de/api/interpreter",
        params={"data": overpass_meta[category]["query"]},
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a confusing JSON decoding error
    # when the API rejects the request.
    response.raise_for_status()
    data = response.json()
    save_poi_file(category, data)
    return data


def get_poi_data(category):
    """Return the POI data for `category`, preferring the local cache file.

    The Overpass API is only queried when no cache file exists yet.
    """
    if not poi_file_exists(category):
        return fetch_poi_data(category)
    return read_poi_file(category)


In [55]:
# Load the raw result and turn its "elements" list into a table.
sustenance_data = get_poi_data("sustenance")
sustenance_elements = sustenance_data["elements"]
sustenance_df = pd.DataFrame(sustenance_elements)
sustenance_df.head(2)

Unnamed: 0,type,id,lat,lon,tags
0,node,72448982,34.076217,-118.21602,"{'amenity': 'fast_food', 'cuisine': 'japanese'..."
1,node,72448995,34.076693,-118.216013,"{'amenity': 'fast_food', 'cuisine': 'burger', ..."


In [56]:
# Label every row with its category and lift the "amenity" tag into its own column.
sustenance_df["category"] = "sustenance"
sustenance_df["amenity"] = sustenance_df["tags"].map(
    lambda poi_tags: poi_tags["amenity"]
)
sustenance_df.head(2)

Unnamed: 0,type,id,lat,lon,tags,category,amenity
0,node,72448982,34.076217,-118.21602,"{'amenity': 'fast_food', 'cuisine': 'japanese'...",sustenance,fast_food
1,node,72448995,34.076693,-118.216013,"{'amenity': 'fast_food', 'cuisine': 'burger', ...",sustenance,fast_food


In [57]:
# Public transport POIs only get a category label; no amenity column is derived here.
public_transport_data = get_poi_data("public_transport")
public_transport_df = pd.DataFrame(public_transport_data["elements"]).assign(
    category="public_transport"
)
public_transport_df.head(2)

Unnamed: 0,type,id,lat,lon,tags,category
0,node,16298122,34.140526,-118.361246,"{'name': 'Universal City', 'network': 'LACMTA'...",public_transport
1,node,18660357,34.090696,-118.291703,"{'name': 'Vermont/Santa Monica', 'network': 'L...",public_transport


In [58]:
# Education POIs: category label plus the "amenity" tag as a separate column.
education_data = get_poi_data("education")
education_df = pd.DataFrame(education_data["elements"])
education_df = education_df.assign(
    category="education",
    amenity=education_df["tags"].map(lambda poi_tags: poi_tags["amenity"]),
)
education_df.head(2)

Unnamed: 0,type,id,lat,lon,tags,category,amenity
0,node,243805625,33.959419,-118.417117,"{'amenity': 'library', 'created_by': 'Potlatch...",education,library
1,node,344327189,34.258253,-118.301348,"{'addr:state': 'CA', 'amenity': 'library', 'el...",education,library


In [59]:
# Arts and culture POIs: category label plus the "amenity" tag as a separate column.
arts_and_culture_data = get_poi_data("arts_and_culture")
arts_and_culture_df = pd.DataFrame(arts_and_culture_data["elements"])
arts_and_culture_df = arts_and_culture_df.assign(
    category="arts_and_culture",
    amenity=arts_and_culture_df["tags"].map(lambda poi_tags: poi_tags["amenity"]),
)
arts_and_culture_df.head(2)

Unnamed: 0,type,id,lat,lon,tags,category,amenity
0,node,368167434,34.084167,-118.482222,"{'addr:state': 'CA', 'amenity': 'arts_centre',...",arts_and_culture,arts_centre
1,node,368167436,34.129722,-118.209722,"{'addr:state': 'CA', 'amenity': 'arts_centre',...",arts_and_culture,arts_centre


In [60]:
# Sports POIs only get a category label; no amenity column is derived here.
sports_data = get_poi_data("sports")
sports_df = pd.DataFrame(sports_data["elements"]).assign(category="sports")
sports_df.head(2)

Unnamed: 0,type,id,lat,lon,tags,category
0,node,358826475,34.047789,-118.334798,"{'ele': '46', 'gnis:county_id': '037', 'gnis:c...",sports
1,node,358826622,34.166808,-118.485205,"{'ele': '213', 'gnis:county_id': '037', 'gnis:...",sports


In [61]:
# Stack the per-category tables into one POI table.
# `ignore_index=True` gives the result a fresh 0..n-1 index; without it every
# source frame contributes its own labels 0, 1, 2, ... and the combined index
# contains duplicates, which makes label-based lookups ambiguous.
poi_df = pd.concat(
    [
        sustenance_df,
        public_transport_df,
        education_df,
        arts_and_culture_df,
        sports_df,
    ],
    ignore_index=True,
)
poi_df.head(2)

Unnamed: 0,type,id,lat,lon,tags,category,amenity
0,node,72448982,34.076217,-118.21602,"{'amenity': 'fast_food', 'cuisine': 'japanese'...",sustenance,fast_food
1,node,72448995,34.076693,-118.216013,"{'amenity': 'fast_food', 'cuisine': 'burger', ...",sustenance,fast_food


# Points of interest data processing

In [62]:
# H3 resolution used for every hexagon computed in this notebook.
resolution = 8


def convert_to_hex(latitude, longitude):
    """Return the H3 index of the cell containing the given coordinate.

    The cell is computed at the module-level ``resolution``.
    """
    return h3.geo_to_h3(latitude, longitude, resolution)

In [63]:
# Assign every POI to the H3 cell that contains it, then discard the columns
# that are not used in the remainder of the notebook.
poi_hexagons = poi_df.apply(
    lambda poi_row: convert_to_hex(poi_row["lat"], poi_row["lon"]),
    axis=1,
)
poi_df["hex"] = poi_hexagons
poi_df = poi_df.drop(columns=["type", "id", "tags"])
poi_df.head(2)


Unnamed: 0,lat,lon,category,amenity,hex
0,34.076217,-118.21602,sustenance,fast_food,8829a1d73dfffff
1,34.076693,-118.216013,sustenance,fast_food,8829a1d73dfffff


In [64]:
# Count the POIs per (hexagon, category) pair; the count column is named
# "number of poi".
all_hexagons_with_poi = (
    poi_df.groupby(["hex", "category"])
    .size()
    .reset_index(name="number of poi")
)
all_hexagons_with_poi.head(2)

Unnamed: 0,hex,category,number of pois
0,881f9c344dfffff,sustenance,1
1,882664501bfffff,sustenance,1


In [65]:
trips_df = pd.read_pickle("../00_data/trips.pkl")

# Map the start coordinate of every trip onto its H3 cell.
start_hexagons = trips_df.apply(
    lambda trip_row: convert_to_hex(
        trip_row["start_latitude"], trip_row["start_longitude"]
    ),
    axis=1,
)
trips_df["start_hex"] = start_hexagons

# Same for the end coordinate.
end_hexagons = trips_df.apply(
    lambda trip_row: convert_to_hex(
        trip_row["end_latitude"], trip_row["end_longitude"]
    ),
    axis=1,
)
trips_df["end_hex"] = end_hexagons

In [66]:
# A hexagon counts if at least one trip starts or ends in it.
trip_hexagons = pd.concat([trips_df["start_hex"], trips_df["end_hex"]])
number_of_unique_hexagons = len(trip_hexagons.unique())
print(f"We have identified {number_of_unique_hexagons} h3 hexagons with resolution {resolution}.")

We have identified 75 h3 hexagons with resolution 8.


In [67]:
# Table of the H3 hexagons that occur in the trip data.
# Note: a hexagon is only listed if at least one trip started or ended in it.
hexagons_df = pd.DataFrame(
    {"hex": pd.concat([trips_df["start_hex"], trips_df["end_hex"]]).unique()}
)
hexagons_df.head(2)

Unnamed: 0,hex
0,8829a1d757fffff
1,8829a1d755fffff


In [68]:
# For every hexagon, store the result of `h3.k_ring(hex, 1)` as a list
# (the sample output shows the hexagon itself plus its neighbours).
hexagons_df["hex_and_neighbors"] = hexagons_df["hex"].map(
    lambda center_hex: list(h3.k_ring(center_hex, 1))
)
hexagons_df.head(2)

Unnamed: 0,hex,hex_and_neighbors
0,8829a1d757fffff,"[8829a1d70bfffff, 8829a1d755fffff, 8829a1d751f..."
1,8829a1d755fffff,"[8829a1d709fffff, 8829a1d75dfffff, 8829a1d70bf..."


In [69]:
# Inspect the neighbourhood stored for the first hexagon.
hexagons_df.loc[0, "hex_and_neighbors"]

['8829a1d70bfffff',
 '8829a1d755fffff',
 '8829a1d751fffff',
 '8829a1d71dfffff',
 '8829a1d753fffff',
 '8829a1d719fffff',
 '8829a1d757fffff']

In [70]:
def calculate_poi(hex_and_neighbors, category):
    """Sum the POI counts of `category` over the given hexagons.

    Reads the module-level ``all_hexagons_with_poi`` table and adds up its
    "number of poi" column for all rows whose "hex" is contained in
    `hex_and_neighbors` and whose "category" equals `category`.
    """
    in_neighbourhood = all_hexagons_with_poi["hex"].isin(hex_and_neighbors)
    in_category = all_hexagons_with_poi["category"] == category
    matching_rows = all_hexagons_with_poi.loc[in_neighbourhood & in_category]
    return matching_rows["number of poi"].sum()

In [71]:
# One "<category>_poi" column per POI category, holding the number of POIs of
# that category found in the hexagon's `hex_and_neighbors` list.
# A single loop replaces five statements that differed only in the category name.
poi_categories = [
    "sustenance",
    "public_transport",
    "education",
    "arts_and_culture",
    "sports",
]
for poi_category in poi_categories:
    # The lambda is evaluated inside this iteration, so it sees the current
    # value of `poi_category`.
    hexagons_df[f"{poi_category}_poi"] = hexagons_df["hex_and_neighbors"].apply(
        lambda neighbourhood: calculate_poi(neighbourhood, poi_category)
    )
hexagons_df.head(2)


Unnamed: 0,hex,hex_and_neighbors,sustenance_pois,public_transport_pois,education_pois,arts_and_culture_pois,sports_pois
0,8829a1d757fffff,"[8829a1d70bfffff, 8829a1d755fffff, 8829a1d751f...",250,176,6,8,7
1,8829a1d755fffff,"[8829a1d709fffff, 8829a1d75dfffff, 8829a1d70bf...",115,88,5,4,3


We will add the newly acquired data to the trips dataset. Performing cluster analysis with these new features might yield interesting results.

In [72]:
# Attach the POI counts of the start hexagon to every trip.
start_poi_columns = {
    "sustenance_poi": "sustenance_poi_start",
    "public_transport_poi": "public_transport_poi_start",
    "education_poi": "education_poi_start",
    "arts_and_culture_poi": "arts_and_culture_poi_start",
    "sports_poi": "sports_poi_start",
}
trips_df = (
    trips_df
    # `../00_data/trips.pkl` is overwritten with the enriched frame at the end
    # of this notebook, so on a re-run the loaded trips already carry these
    # columns. Drop them first; otherwise the rename below would create
    # duplicate column names.
    .drop(columns=list(start_poi_columns.values()), errors="ignore")
    .merge(
        hexagons_df.drop(columns=["hex_and_neighbors"]),
        left_on="start_hex",
        right_on="hex",
        # Each hexagon must occur only once in `hexagons_df`; fail loudly
        # rather than silently duplicating trips if that ever changes.
        validate="many_to_one",
    )
    .drop(columns=["hex"])
    .rename(columns=start_poi_columns)
)
trips_df.head(2)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,start_latitude,...,sustenance_pois_end,public_transport_pois_end,education_pois_end,arts_and_culture_pois_end,sports_pois_end,sustenance_pois_start,public_transport_pois_start,education_pois_start,arts_and_culture_pois_start,sports_pois_start
0,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,5992,Walk-up,Main & 1st,Broadway & 9th,0 days 00:32:00,34.05194,...,270,180,5,9,11,250,176,6,8,7
1,2019-01-01 00:20:00,2019-01-01 00:50:00,3030,3075,5860,Walk-up,Main & 1st,Broadway & 9th,0 days 00:30:00,34.05194,...,270,180,5,9,11,250,176,6,8,7


In [73]:
# Attach the POI counts of the end hexagon to every trip.
end_poi_columns = {
    "sustenance_poi": "sustenance_poi_end",
    "public_transport_poi": "public_transport_poi_end",
    "education_poi": "education_poi_end",
    "arts_and_culture_poi": "arts_and_culture_poi_end",
    "sports_poi": "sports_poi_end",
}
trips_df = (
    trips_df
    # `../00_data/trips.pkl` is overwritten with the enriched frame at the end
    # of this notebook, so on a re-run the loaded trips already carry these
    # columns. Drop them first; otherwise the rename below would create
    # duplicate column names.
    .drop(columns=list(end_poi_columns.values()), errors="ignore")
    .merge(
        hexagons_df.drop(columns=["hex_and_neighbors"]),
        left_on="end_hex",
        right_on="hex",
        # Each hexagon must occur only once in `hexagons_df`; fail loudly
        # rather than silently duplicating trips if that ever changes.
        validate="many_to_one",
    )
    .drop(columns=["hex"])
    .rename(columns=end_poi_columns)
)
trips_df.head(2)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,start_latitude,...,sustenance_pois_start,public_transport_pois_start,education_pois_start,arts_and_culture_pois_start,sports_pois_start,sustenance_pois_end,public_transport_pois_end,education_pois_end,arts_and_culture_pois_end,sports_pois_end
0,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,5992,Walk-up,Main & 1st,Broadway & 9th,0 days 00:32:00,34.05194,...,250,176,6,8,7,270,180,5,9,11
1,2019-01-01 00:20:00,2019-01-01 00:50:00,3030,3075,5860,Walk-up,Main & 1st,Broadway & 9th,0 days 00:30:00,34.05194,...,250,176,6,8,7,270,180,5,9,11


In [74]:
# Persist the results. Note that the trips file written here is the same path
# the trips were loaded from earlier in this notebook.
for result_frame, pickle_path in (
    (poi_df, '../00_data/poi.pkl'),
    (hexagons_df, '../00_data/hexagons.pkl'),
    (trips_df, '../00_data/trips.pkl'),
):
    result_frame.to_pickle(pickle_path)