In [None]:
import pandas as pd
import numpy as np
# Never truncate columns or rows when a DataFrame is displayed.
for _option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option, None)

In [None]:
# Load the post office CSV (99 offices, names translated into English,
# only the columns of interest kept) and preview the first 15 rows.
dfpo2 = pd.read_csv(
    '2019-07_post-office-Hokkaido1e.csv',
    dtype={
        'po_code': str,
        'po_name': str,
        'longitude': np.float64,
        'latitude': np.float64,
    },
)
dfpo2.head(n=15)

In [None]:
# Centroid (mean longitude / mean latitude) of the 99 locations.
epi_longitude, epi_latitude = (
    dfpo2.loc[:, ['longitude', 'latitude']]
    .mean()
    .tolist()
)

In [None]:
# Display a Hokkaido map
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you don't have the folium module
import folium # map rendering library

# Base map centred on the mean post office position
map3 = folium.Map(
    zoom_start=7,
    location=[epi_latitude, epi_longitude],
)
map3

In [None]:
# Add one circle marker per post office (99 in total) to the map.
po_marker_style = dict(
    radius=5,
    color='yellow',
    fill=True,
    fill_color='blue',
    fill_opacity=0.7,
)
for lat, lon, po_name in dfpo2[['latitude', 'longitude', 'po_name']].itertuples(index=False, name=None):
    marker = folium.CircleMarker([lat, lon], popup=po_name, **po_marker_style)
    marker.add_to(map3)
map3

In [None]:
import operator

In [None]:
# Create a data frame excluding post offices with "Sapporo" in their name.
# Series.str.contains with regex=False is a literal, case-sensitive substring
# test; na=False treats a missing name as "no match", so such a row is kept
# instead of raising.
dfpo4 = dfpo2[~dfpo2['po_name'].str.contains('Sapporo', regex=False, na=False)]
dfpo4.head()

In [None]:
# Then exclude post offices with "Otaru" in their name.
# Literal, case-sensitive substring test; a missing name counts as "no match".
dfpo5 = dfpo4[~dfpo4['po_name'].str.contains('Otaru', regex=False, na=False)]
dfpo5.head()

In [None]:
# Then exclude post offices with "Yoichi" in their name.
# Literal, case-sensitive substring test; a missing name counts as "no match".
dfpo6 = dfpo5[~dfpo5['po_name'].str.contains('Yoichi', regex=False, na=False)]
dfpo6.head()

In [None]:
# Then exclude post offices with "Muroran" in their name.
# Literal, case-sensitive substring test; a missing name counts as "no match".
dfpo7 = dfpo6[~dfpo6['po_name'].str.contains('Muroran', regex=False, na=False)]
dfpo7.head()

In [None]:
# Then exclude post offices with "Tomakomai" in their name.
# Literal, case-sensitive substring test; a missing name counts as "no match".
dfpo8 = dfpo7[~dfpo7['po_name'].str.contains('Tomakomai', regex=False, na=False)]
dfpo8.head(15)

In [None]:
# Number of post offices remaining after the name filters.
len(dfpo8)

In [None]:
import requests # library to handle requests

In [None]:
# Foursquare credentials.
# They are read from the FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET
# environment variables so that real keys never have to be typed into (and
# committed with) the notebook. When a variable is not set, the dummy
# placeholder below is used and must be replaced with your own value.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'YOUR_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'YOUR CLIENT_SECRET')
VERSION = '20180605' # Foursquare API version

In [None]:
# Trending places around the 78 not-so-popular places.
# It may take long if radius and LIMIT are large.
# The API occasionally returns a response without the expected keys
# (intermittent network errors or errors at the data source). Such a response
# is reported and skipped instead of aborting the loop with a KeyError;
# re-running the cell usually fills the gap.

LIMIT = 100 # limit of number of venues returned by Foursquare API (max: 100)
radius = 1000 # define radius (max: 2000 = 2km)
TRENDING_URL = 'https://api.foursquare.com/v2/venues/trending'
for index, row in dfpo8.iterrows():
    # Let requests build and escape the query string.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(row['latitude'], row['longitude']),
        'radius': radius,
        'limit': LIMIT,
    }
    # The timeout keeps one stalled connection from hanging the whole cell.
    response = requests.get(TRENDING_URL, params=params, timeout=30)
    results = response.json()
    venues = results.get('response', {}).get('venues')
    if venues is None:
        print (row['po_name'], ": no venue list in response (HTTP {})".format(response.status_code))
        continue
    # List up trend venues
    for trend_venues in venues:
        # A venue may come back with an empty category list.
        categories = trend_venues.get('categories') or []
        category_name = categories[0]['name'] if categories else None
        print (row['po_name'], ":", trend_venues['name'], ":", category_name)

In [None]:
# The 4 easternmost locations (largest longitude first).
dfpo8.sort_values(by='longitude', ascending=False).iloc[:4]

In [None]:
# The 2 westernmost locations (smallest longitude first).
dfpo8.sort_values(by='longitude').iloc[:2]

In [None]:
# The 2 northernmost locations (largest latitude first).
dfpo8.sort_values(by='latitude', ascending=False).iloc[:2]

In [None]:
# Map of the 78 remaining post offices, centred on their mean position.
epi_longitude, epi_latitude = dfpo8.loc[:, ['longitude', 'latitude']].mean().tolist()
map4 = folium.Map(zoom_start=7, location=[epi_latitude, epi_longitude])
remaining_marker_style = dict(
    radius=5,
    color='yellow',
    fill=True,
    fill_color='blue',
    fill_opacity=0.7,
)
for lat, lon, po_name in dfpo8[['latitude', 'longitude', 'po_name']].itertuples(index=False, name=None):
    marker = folium.CircleMarker([lat, lon], popup=po_name, **remaining_marker_style)
    marker.add_to(map4)
map4

In [None]:
# Distance between 2 places, based on their latitudes and longitudes.
from math import sin, cos, sqrt, atan2, radians

def dist_lat_long(latd1, lond1, latd2, lond2, radius_km=6373.0):
    """Return the great-circle (haversine) distance between two points.

    Parameters
    ----------
    latd1, lond1 : latitude and longitude of the first point, in degrees.
    latd2, lond2 : latitude and longitude of the second point, in degrees.
    radius_km    : radius of the sphere; defaults to the approximate radius
                   of the earth in km. The result is in the same unit.

    Returns
    -------
    float : distance along the sphere's surface.
    """
    # convert to radian
    latr1 = radians(latd1)
    lonr1 = radians(lond1)
    latr2 = radians(latd2)
    lonr2 = radians(lond2)
    dlonr = lonr2 - lonr1
    dlatr = latr2 - latr1

    a = sin(dlatr/2)**2 + cos(latr1) * cos(latr2) * sin(dlonr/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius_km * c

In [None]:
# Distance #1: from Erimo Misaki to Hidaka Meguro.
dest1 = 'Hidaka Meguro'
origin1 = 'Erimo Misaki'
coord_cols = ['longitude', 'latitude']
# First matching row of each post office, as [longitude, latitude].
lond1, latd1 = dfpo8.loc[dfpo8['po_name'] == dest1, coord_cols].iloc[0].tolist()
lond2, latd2 = dfpo8.loc[dfpo8['po_name'] == origin1, coord_cols].iloc[0].tolist()
distance_km = dist_lat_long(latd1, lond1, latd2, lond2)
print("Distance from {} to {}: {:.1f} km".format(origin1, dest1, distance_km))

In [None]:
# Post office names selected as places to visit.
places_to_visit = [
    'Horo',
    'Hamamasu',
    'Hidaka Meguro',
    'Shoya',
    'Erimo Misaki',
    'Erimo',
    'Shimamaki',
    'Honme',
    'Niwan',
    'Biratori',
    'Nakoma',
    'Konbu',
]

In [None]:
# Distance #2: Nakoma from the major post offices.
# Start by isolating Nakoma's coordinates.
dest = 'Nakoma'
dfNakoma = dfpo8.loc[dfpo8['po_name'] == dest, ['latitude', 'longitude']]

In [None]:
# [latitude, longitude] of the first matching row.
dfNakoma.iloc[0].tolist()

In [None]:
# Major post offices, looked up by exact name in the unfiltered frame.
major_po = [
    'Sapporo Chuo',
    'Otaru',
    'Muroran',
    'Tomakomai',
]
dfmpo = dfpo2.loc[dfpo2['po_name'].isin(major_po)]

In [None]:
# Coordinates of the major post offices as a (latitude, longitude) array.
dfmpo.loc[:, ['latitude', 'longitude']].values

In [None]:
# Wrapper: array-based distance calculator
def dist_lat_long1(X1):
    """Return the distance from the point X1 to Nakoma.

    X1 is any sequence-like object (list, array, or DataFrame row) whose
    first two items are latitude and longitude in degrees. The items are
    read by iteration rather than by `X1[0]` / `X1[1]`, because integer
    indexing of a label-indexed pandas row relies on a deprecated positional
    fallback.

    Uses the dfNakoma data frame defined outside; its first row supplies the
    reference latitude and longitude.
    """
    lat1, lon1 = list(X1)[:2]
    latN, lonN = dfNakoma.values[0].tolist()
    return dist_lat_long(lat1, lon1, latN, lonN)

In [None]:
# Distance from each major post office to Nakoma, computed row by row.
df_dist_mpo = dfmpo.loc[:, ['latitude', 'longitude']].apply(
    dist_lat_long1,
    result_type='expand',
    axis=1,
)
df_dist_mpo

In [None]:
# Copy of the major post offices with the distance attached as column 'dist'.
dfmpo2 = dfmpo.assign(dist=df_dist_mpo)
dfmpo2

In [None]:
# Rows of the unfiltered frame whose name is one of the places to visit.
visit_mask = dfpo2['po_name'].isin(places_to_visit)
df_ptv = dfpo2.loc[visit_mask]
df_ptv

In [None]:
# Empty frame with the three columns used for nearby-venue rows.
df_nearby1 = pd.DataFrame(
    data=None,
    columns=['Post Office', 'Venue', 'Category'],
)

In [None]:
# Collect the venues Foursquare recommends around each place to visit.
# Rows are gathered in a plain list and the data frame is built once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call. Building the frame from scratch also makes the cell safe to
# re-run (no duplicated rows).
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 2000 # define radius (max: 2000 = 2km)
nearby_rows = []
for index, row in df_ptv.iterrows(): # places_to_visit
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, CLIENT_SECRET, VERSION, row['latitude'], row['longitude'], radius, LIMIT)
    results = requests.get(url).json()
    for item in results['response']['groups'][0]['items']:
        venue = item['venue']
        # A venue may come back with an empty category list.
        categories = venue.get('categories') or []
        nearby_rows.append({
            'Post Office': row['po_name'],
            'Venue': venue['name'],
            'Category': categories[0]['name'] if categories else None,
        })
df_nearby1 = pd.DataFrame(nearby_rows, columns=['Post Office','Venue','Category'])

In [None]:
# Number of venues found within the 2 km radius of the places to visit in Hokkaido.
len(df_nearby1)

In [None]:
# Display the nearby-venue data frame.
df_nearby1

In [None]:
# Count the venues found for each post office, then list the post offices
# from the most to the fewest venues.
df_nearby3 = (
    df_nearby1
    .groupby(by=['Post Office'])['Venue']
    .count()
    .reset_index()
)
df_nearby3.sort_values(by='Venue', ascending=False)

In [None]:
# Looking into Foursquare venue categories
import json

In [None]:
# create URL venue categories
url = 'https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION)
# Display the URL with the client secret masked, so the secret does not end up
# in the saved cell output. `url` itself is left intact for the request.
url.replace(CLIENT_SECRET, '********') if CLIENT_SECRET else url

In [None]:
# Fetch the category tree. The timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status() turns an HTTP error into an immediate,
# explicit exception instead of a KeyError further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

In [None]:
# Create a data frame for all the categories.
# Tried out increasing numbers of levels and found out there are five category levels.
# Only the final result is included here.
#
# The nested category tree is flattened by one recursive helper and the frame
# is built in a single call: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every row.
CATEGORY_COLUMNS = ['Level', 'Cat1', 'Cat2', 'Cat3', 'Cat4', 'Cat5']
MAX_CATEGORY_LEVEL = 5

def _flatten_categories(nodes, parents=()):
    """Return one row per category, parents before children (pre-order).

    nodes   : list of category dicts, each with a 'name' and, optionally, a
              nested 'categories' list.
    parents : names of the ancestors of `nodes`, outermost first.

    Each row is [level, Cat1, ..., Cat5]; levels deeper than the category's
    own are filled with ''. Nesting beyond MAX_CATEGORY_LEVEL is ignored.
    """
    rows = []
    for node in nodes:
        path = parents + (node['name'],)
        level = len(path)
        rows.append([level] + list(path) + [''] * (MAX_CATEGORY_LEVEL - level))
        if level < MAX_CATEGORY_LEVEL:
            rows.extend(_flatten_categories(node.get('categories', []), path))
    return rows

# Populate the data frame.
df_cat1 = pd.DataFrame(
    _flatten_categories(results['response']['categories']),
    columns=CATEGORY_COLUMNS)

In [None]:
# Show the rows that describe level 5 categories.
is_level5 = df_cat1['Level'] == 5
df_cat1.loc[is_level5]

In [None]:
# Pairwise distance wrapper, passed to DBSCAN as its metric.
def dist_lat_long2(X1, X2):
    """Return the distance between two points given as indexable sequences.

    X1 and X2 each hold latitude at index 0 and longitude at index 1.
    """
    lat_a, lon_a = X1[0], X1[1]
    lat_b, lon_b = X2[0], X2[1]
    return dist_lat_long(lat_a, lon_a, lat_b, lon_b)

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Feature matrix for clustering: one (latitude, longitude) row per post office.
X = dfpo8.loc[:, ['latitude', 'longitude']].values
X[:5]

In [None]:
# Epsilon is the distance (km) between two places. Several values were tried:
# 15km gives a good clustering result; 10km, 12km and 20km are also OK.
epsilon = 15.0      # neighbourhood radius used to decide whether an area is dense
minimumSamples = 2  # minimum number of points in a neighbourhood to form a cluster
db = DBSCAN(
    metric=dist_lat_long2,
    min_samples=minimumSamples,
    eps=epsilon,
)
db.fit(X)
labels = db.labels_
labels

In [None]:
# Copy of the filtered post offices with the cluster label attached as column 'label'.
dfpo9 = dfpo8.assign(label=labels)

In [None]:
# Largest cluster label.
np.max(labels)

In [None]:
# Preview the first five labelled rows.
dfpo9.head(n=5)

In [None]:
# Marker colours, indexed by cluster label + 1 (index 0 is used for label -1).
# Every entry is a CSS colour keyword and no colour is repeated. The former
# 'darkpurple' and 'lightred' entries were folium icon colour names, not CSS
# colours, and 'blue' appeared twice.
colors = ['black', 'gray', 'darkblue', 'blue', 'indigo',
    'purple', 'darkgreen', 'green', 'orange', 'darkred', 'red',
    'pink', 'lightgray', 'lightcoral', 'beige', 'cadetblue', 'white', 'lightblue', 'lightgreen', 'teal']

In [None]:
# Display the 78 post office locations on the map, coloured by cluster label.
epi_longitude,epi_latitude = dfpo8[['longitude','latitude']].mean().tolist()
map5 = folium.Map(location=[epi_latitude, epi_longitude], zoom_start=7)
for lat, lon, po_name, lbl in zip(dfpo9['latitude'],dfpo9['longitude'],dfpo9['po_name'],dfpo9['label']):
    # colors[0] is kept for label -1; cluster labels cycle through the
    # remaining entries, so a label beyond the palette size wraps around
    # instead of raising IndexError.
    if lbl < 0:
        marker_color = colors[0]
    else:
        marker_color = colors[1 + lbl % (len(colors) - 1)]
    folium.CircleMarker(
        [lat, lon],
        radius=3,
        popup=po_name,
        color=marker_color,
        fill=True,
        fill_color=marker_color,  # keyword is fill_color, as in the earlier maps
        fill_opacity=0.7).add_to(map5)
map5

In [None]:
# Try epsilon = 3, 6, ..., 24 and print the clustering each value yields.
for epsil in range(3, 27, 3):
    db = DBSCAN(metric=dist_lat_long2, min_samples=2, eps=epsil)
    db.fit(X)
    labels = db.labels_
    n_clusters = labels.max() + 1
    n_outliers = len(labels[labels == -1])  # label -1 is counted as an outlier
    summary = 'Epsilon={:3d}, Number of clusters={:3d}, Number of outliers={:3d}'.format(
        epsil, n_clusters, n_outliers)
    print (summary)
    print (labels)

In [None]:
# Number of clusters and outliers for every integer epsilon from 2 to 24.
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
dbscan_rows = []
for epsil in range(2,25):
    db = DBSCAN(eps=epsil, min_samples=2, metric=dist_lat_long2).fit(X)
    labels = db.labels_
    dbscan_rows.append((
        epsil,
        int(labels.max()) + 1,       # cluster labels start at 0
        int((labels == -1).sum()),   # label -1 is counted as an outlier
    ))
dfDBSCAN = pd.DataFrame(dbscan_rows, columns=['Epsilon','NumCluster','NumOutlier'])

In [None]:
%matplotlib inline
dfDBSCAN.plot.line(x='Epsilon',xlim=[0,25],ylim=[0,80])