In [1]:
import json
import math
import geopy
import configparser
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from folium.plugins import FloatImage

import pyproj

import shapely.geometry
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from tqdm.notebook import tqdm
%matplotlib inline
tqdm.pandas('pandas')
# Load API credentials from the local INI-style file 'credentials.env'.
config = configparser.ConfigParser()
# Use a context manager so the file handle is closed once parsing finishes.
with open('credentials.env') as credentials_file:
    config.read_file(credentials_file)

  from pandas import Panel


In [2]:
# The center of our geographic area, stored as (longitude, latitude).
# Cells that need (latitude, longitude) order reverse it with center[::-1].
center = (-73.80678292908537, 42.67626127679524)

In [3]:
def lonlat_to_xy(lon, lat):
    """Project WGS84 longitude/latitude to UTM zone 18 coordinates.

    Args:
        lon: Longitude in degrees.
        lat: Latitude in degrees.

    Returns:
        Tuple ``(x, y)`` with the UTM zone 18 easting and northing.
    """
    proj_latlon = pyproj.Proj(proj='latlong', datum='WGS84')
    proj_xy = pyproj.Proj(proj="utm", zone=18, datum='WGS84')
    # pyproj.transform() is deprecated; Transformer is the supported API.
    # always_xy=True keeps the (lon, lat) -> (x, y) argument order used here.
    transformer = pyproj.Transformer.from_proj(proj_latlon,
                                               proj_xy,
                                               always_xy=True)
    xy = transformer.transform(lon, lat)
    return xy[0], xy[1]


def xy_to_lonlat(x, y):
    """Convert UTM zone 18 coordinates back to WGS84 longitude/latitude.

    Args:
        x: UTM zone 18 easting.
        y: UTM zone 18 northing.

    Returns:
        Tuple ``(lon, lat)`` in degrees.
    """
    proj_latlon = pyproj.Proj(proj='latlong', datum='WGS84')
    proj_xy = pyproj.Proj(proj="utm", zone=18, datum='WGS84')
    # pyproj.transform() is deprecated; Transformer is the supported API.
    # always_xy=True keeps the (x, y) -> (lon, lat) argument order used here.
    transformer = pyproj.Transformer.from_proj(proj_xy,
                                               proj_latlon,
                                               always_xy=True)
    lonlat = transformer.transform(x, y)
    return lonlat[0], lonlat[1]


def calc_xy_distance(x1, y1, x2, y2):
    """Return the straight-line distance between (x1, y1) and (x2, y2)."""
    delta_x, delta_y = x2 - x1, y2 - y1
    squared_length = delta_x * delta_x + delta_y * delta_y
    return math.sqrt(squared_length)


# Round-trip sanity check: project the area center to UTM and back again,
# printing each stage so the values can be compared by eye.
print('Coordinate transformation check')
print('-------------------------------')
# `center` is unpacked in (longitude, latitude) order to match the format string.
print('Capital District longitude={}, latitude={}'.format(*center))
x, y = lonlat_to_xy(*center)
print('Capital District UTM X={}, Y={}'.format(x, y))
# The inverse transform should reproduce the original longitude/latitude.
lo, la = xy_to_lonlat(x, y)
print('Capital District  longitude={}, latitude={}'.format(lo, la))

Coordinate transformation check
-------------------------------
Capital District longitude=-73.80678292908537, latitude=42.67626127679524
Capital District UTM X=597766.7852150269, Y=4725555.307055264
Capital District  longitude=-73.80678292908537, latitude=42.676261276795245


In [34]:
def generate_hex_grid(center):
    """Build a hexagonally packed grid of points around ``center``.

    Candidate points are laid out in projected coordinates with a horizontal
    step of 2500 and a vertical step of 2500 * sqrt(3) / 2; every other row
    is shifted by half a horizontal step. Only points whose distance from
    the center is at most 50050 are kept.

    Args:
        center: ``(longitude, latitude)`` tuple of the area center.

    Returns:
        DataFrame with one row per kept point and the columns
        ``latitude``, ``longitude``, ``x``, ``y`` and ``distance``.
    """
    # Area center in Cartesian (projected) coordinates.
    center_x, center_y = lonlat_to_xy(*center)

    k = math.sqrt(3) / 2  # Vertical offset factor for hexagonal grid cells
    x_step = 2500
    y_step = 2500 * k
    x_min = center_x - 50000
    y_min = center_y - 50000 - (int(126 / k) * k * 2500 - 100000) / 2

    rows = []
    for row in range(int(125 / k)):
        y = y_min + row * y_step
        # Even rows are shifted by half a horizontal step.
        x_offset = 0 if row % 2 else 1250
        for col in range(125):
            x = x_min + col * x_step + x_offset
            distance_from_center = calc_xy_distance(center_x, center_y, x, y)
            if not (distance_from_center <= 50050):
                continue
            lon, lat = xy_to_lonlat(x, y)
            rows.append((lat, lon, x, y, distance_from_center))

    return pd.DataFrame(
        rows, columns=['latitude', 'longitude', 'x', 'y', 'distance'])

# Reuse the grid saved by a previous run when it exists; otherwise build it
# once and save it so later runs can skip the projection work.
try:
    neighborhoods = pd.read_csv('neighborhoods.csv', index_col=0)
except FileNotFoundError:
    neighborhoods = generate_hex_grid(center)
    neighborhoods.to_csv('neighborhoods.csv')

neighborhoods.shape

(1455, 5)

In [35]:
# Discard any map left over from an earlier run of this cell.
globals().pop('hex_map', None)

map_center = center[::-1]  # (latitude, longitude) order for folium
hex_map = folium.Map(location=map_center, zoom_start=10)

# Outline of the whole study area.
folium.Circle(map_center, radius=50050).add_to(hex_map)

# One faint circle per grid point, with its coordinates as the popup text.
grid_points = neighborhoods[['latitude', 'longitude']]
for lat, long in grid_points.itertuples(index=False):
    popup = folium.Popup(f'{lat, long}', parse_html=True)
    marker = folium.Circle([lat, long],
                           radius=1250,
                           popup=popup,
                           color='red',
                           opacity=.1,
                           fill=True,
                           fill_color='red',
                           fill_opacity=0.01,
                           parse_html=False)
    marker.add_to(hex_map)

hex_map

The rendered map is shown below:
![image.png](attachment:image.png)

In [40]:
# Foursquare endpoint used by every query function in this notebook.
url = 'https://api.foursquare.com/v2/venues/explore'

# Base query parameters shared by all requests; the client id and secret are
# read from the 'foursquare' section of the loaded credentials file.
fsq_params = dict(
    client_id=config.get('foursquare', 'client_id'),
    client_secret=config.get('foursquare', 'client_secret'),
    v='20200222',
)


def query_fsquare_for_nearby_venues(lat,
                                    long,
                                    limit=100,
                                    radius=2500,
                                    credentials=fsq_params,
                                    check_cache=False):
    """Query the Foursquare explore endpoint for venues around a point.

    Args:
        lat: Latitude of the query point.
        long: Longitude of the query point.
        limit: Value sent as the ``limit`` request parameter.
        radius: Value sent as the ``radius`` request parameter.
        credentials: Base request parameters (client id, secret, version).
            The mapping is copied and never modified.
        check_cache: ``False`` to always send the request, or the path of a
            CSV file with ``center_lat`` / ``center_long`` columns. When the
            point is already listed in that file no request is made.

    Returns:
        ``{'lat': lat, 'long': long, 'resp': <requests.Response>}``, or
        ``None`` when the point was found in the cache.
    """
    if check_cache:
        cache = pd.read_csv(check_cache,
                            usecols=['center_lat', 'center_long'])
        already_cached = ((cache['center_lat'] == lat)
                          & (cache['center_long'] == long)).any()
        if already_cached:
            return None

    # Work on a copy: the default `credentials` dict is shared by every call,
    # so writing per-request values into it would leak between requests.
    params = dict(credentials)
    params['ll'] = '{},{}'.format(lat, long)
    params['limit'] = limit
    params['radius'] = radius
    resp = requests.get(url=url, params=params)
    return {'lat': lat, 'long': long, 'resp': resp}


def normalize_response(response):
    """Flatten one Foursquare explore response into a venue table.

    Args:
        response: Dict with the keys ``'lat'`` and ``'long'`` (the queried
            point) and ``'resp'`` (an object whose ``.json()`` returns the
            decoded API payload).

    Returns:
        DataFrame with the columns ``venue``, ``latitude``, ``longitude``,
        ``category``, ``center_lat`` and ``center_long``. When the payload
        holds no usable venues, a single placeholder row named ``'None'``
        located at the queried point is returned instead.

    Raises:
        KeyError: If the payload lacks ``response -> groups -> items``.
    """

    def _first_short_name(categories):
        # A venue may come back with no categories at all; label it 'None'
        # instead of failing on categories[0].
        if isinstance(categories, list) and categories:
            return categories[0]['shortName']
        return 'None'

    payload = response['resp'].json()
    items = pd.json_normalize(payload['response']['groups'][0]['items'])
    try:
        # Copy so the column assignment below works on an independent frame
        # rather than on a slice of `items`.
        results = items[[
            'venue.name', 'venue.location.lat', 'venue.location.lng',
            "venue.categories"
        ]].copy()
        results['venue.categories'] = results['venue.categories'].map(
            _first_short_name)
    except KeyError:
        # Mock response venue: keeps the queried point present in the output
        # even though nothing usable was returned for it.
        results = pd.DataFrame(
            {
                'venue.name': 'None',
                'venue.location.lat': response['lat'],
                'venue.location.lng': response['long'],
                'venue.categories': 'None'
            },
            index=[0])
    results['center_lat'] = response['lat']
    results['center_long'] = response['long']
    results.columns = [
        'venue', 'latitude', 'longitude', 'category', 'center_lat',
        'center_long'
    ]
    return results


def cache_successful_response(successful_response_data,
                              cache_path='response200.csv'):
    """Merge newly fetched venue rows into the on-disk CSV cache.

    Args:
        successful_response_data: DataFrame of venue rows to store.
        cache_path: CSV file used as the cache. It is created when missing.

    Returns:
        Tuple ``(old_records, new_records, non_duplicates)``: the number of
        rows previously cached, the number of rows passed in, and the growth
        of the cache after dropping duplicate rows.
    """
    new_records = len(successful_response_data)
    try:  # look for existing cache
        previously_cached_data = pd.read_csv(cache_path, index_col=0)
    except FileNotFoundError as e:
        tqdm.write(str(e))
        successful_response_data.to_csv(cache_path)
        # First write: nothing was cached before, so every row is new.
        return 0, new_records, new_records

    old_records = len(previously_cached_data)
    results = pd.concat([successful_response_data,
                         previously_cached_data]).drop_duplicates()
    results.to_csv(cache_path)
    non_duplicates = len(results) - old_records
    return old_records, new_records, non_duplicates

In [41]:

# Query Foursquare once per grid point, caching successes on disk and
# collecting (lat, long, status_code) for every failed request.
fails = []
# The first point is always queried; later points are checked against the
# CSV cache first and skipped when they are already in it.
check_cache = False

for ix, lat, long in tqdm(
        list(neighborhoods[['latitude', 'longitude']].itertuples())):
    q = query_fsquare_for_nearby_venues(lat, long, check_cache=check_cache)
    check_cache = "response200.csv"
    if q is None:
        # Point already cached: nothing was requested.
        continue

    status = q['resp'].status_code
    if status == 200:
        success = normalize_response(q)
        old, new, nondupe = cache_successful_response(success)
    else:
        # Test the status code explicitly: a requests.Response is falsy for
        # 4xx/5xx, so a truthiness check would silently skip real failures.
        tqdm.write('{} {} {}'.format(q['lat'], q['long'], status))
        fails.append((lat, long, status))

HBox(children=(FloatProgress(value=0.0, max=1455.0), HTML(value='')))




In [45]:
# Reload everything cached so far. NOTE: from this cell on, `neighborhoods`
# holds the rows read from response200.csv rather than the hex grid returned
# by generate_hex_grid.
neighborhoods = pd.read_csv('response200.csv', index_col=0)
# Show the cached rows together with the failures collected by the fetch loop.
neighborhoods, fails

(                          venue   latitude  longitude  \
 0           Farmer's Daughters'  43.100436 -73.651409   
 1   Saratoga Photobooth Company  43.107583 -73.667347   
 2                 Schuyler Park  43.100723 -73.653101   
 3              HomeOwner-Helper  43.100733 -73.698135   
 4           Lynn's Country Cafe  43.099279 -73.653703   
 ..                          ...        ...        ...   
 2           Wolff's Dining Hall  42.222694 -73.950600   
 3          Wolff's Maple Breeze  42.222537 -73.950588   
 4          Ray's Appliance Inc.  42.222976 -73.940493   
 5               K&D Repair Shop  42.244981 -73.978353   
 6      Grand Arbor Tree Service  42.243610 -73.931143   
 
                                              category  center_lat  \
 0                                           Ice Cream   43.113490   
 1                                      Event Services   43.113490   
 2                                      Baseball Field   43.113490   
 3                    

In [46]:
# This file contains a dictionary/JSON map that reduces the number of
# business categories by mapping each category to a more general one.
with open('category_mapping.json') as f:
    # json.load parses the whole file, so the mapping may span several lines.
    category_map = json.load(f)

In [47]:
# Tag every cached row with its generalized category, then derive the table
# of distinct real venues (placeholder 'None' rows are removed).
neighborhoods = pd.read_csv('response200.csv')
neighborhoods = neighborhoods.assign(
    gen_cat=neighborhoods['category'].replace(category_map))

venue_columns = ['venue', 'latitude', 'longitude', 'category', 'gen_cat']
venues = neighborhoods.loc[:, venue_columns].drop_duplicates()
venues = venues.loc[venues['category'] != 'None']

In [None]:
# These are the outermost neighborhoods in our area of interest.
# They will be important later.
# `neighborhoods` was re-read from response200.csv above and has no
# 'distance' column, so take the grid points from the saved hex-grid file.
hex_grid = pd.read_csv('neighborhoods.csv', index_col=0)
furthest = hex_grid['distance'].max()  #50042.95030271565
outer_rim = hex_grid[hex_grid['distance'] > furthest - 2475]

In [None]:
# Discard any map left over from an earlier run of this cell.
globals().pop('capital_district_locations', None)
capital_district_locations = folium.Map(center[::-1], zoom_start=9)

# One small marker per venue; the popup shows "<general category>, <category>".
venue_points = venues[['latitude', 'longitude', 'category', 'gen_cat']]
for lat, long, category, generic in tqdm(
        list(venue_points.itertuples(index=False))):
    popup = folium.Popup(f'{generic}, {category}', parse_html=True)
    marker = folium.CircleMarker([lat, long],
                                 radius=2,
                                 popup=popup,
                                 color='blue',
                                 opacity=.15,
                                 fill=True,
                                 fill_color='lightblue',
                                 fill_opacity=0.1,
                                 parse_html=False)
    marker.add_to(capital_district_locations)

capital_district_locations

In [None]:
def query_fsquare_for_grocery_stores(latitude,
                                     longitude,
                                     limit=100,
                                     radius=30000,
                                     query='groceries',
                                     credentials=fsq_params):
    """Query the Foursquare explore endpoint for grocery stores near a point.

    Args:
        latitude: Latitude of the query point.
        longitude: Longitude of the query point.
        limit: Value sent as the ``limit`` request parameter.
        radius: Value sent as the ``radius`` request parameter.
        query: Search term sent as the ``query`` request parameter.
        credentials: Base request parameters (client id, secret, version).
            The mapping is copied and never modified.

    Returns:
        ``{'lat': latitude, 'long': longitude, 'resp': <requests.Response>}``.
    """
    # Work on a copy: the default `credentials` dict is shared with the other
    # query helper, so adding 'query' to it would alter every later request.
    params = dict(credentials)
    # Use this function's own arguments, not whatever `lat` / `long` globals
    # an earlier loop happened to leave behind.
    params['ll'] = '{},{}'.format(latitude, longitude)
    params['limit'] = limit
    params['radius'] = radius
    params['query'] = query
    resp = requests.get(url=url, params=params)
    return {'lat': latitude, 'long': longitude, 'resp': resp}

In [None]:
# Combine grocery stores found inside the study area with those returned by
# wider-radius queries made from each outer-rim grid point.
grocery_columns = ['latitude', 'longitude', 'gen_cat']
groceries = venues.loc[venues['gen_cat'] == 'Groceries', grocery_columns]

border_frames = []
# Iterate over the rows of `outer_rim`; looping over the frame itself would
# only yield its column names.
for _, lat, long in outer_rim[['latitude', 'longitude']].itertuples():
    resp = query_fsquare_for_grocery_stores(latitude=lat, longitude=long)
    status = resp['resp'].status_code
    if status != 200:
        # Skip failed requests rather than trying to normalize their payload.
        tqdm.write('{} {} {}'.format(lat, long, status))
        continue
    found = normalize_response(resp)
    is_grocery = found['category'].isin(['Supermarket', 'Grocery Store'])
    border_frames.append(found.loc[is_grocery].assign(gen_cat='Groceries'))

if border_frames:
    # Concatenate once instead of growing a frame inside the loop.
    beyond_border_grocery_stores = pd.concat(border_frames)[grocery_columns]
else:
    # No usable responses: an empty frame with matching columns and dtypes.
    beyond_border_grocery_stores = groceries.iloc[0:0]

groceries = pd.concat([beyond_border_grocery_stores,
                       groceries]).drop_duplicates()

In [None]:
# Overlay a larger red ring for every grocery store on the venue map.
grocery_points = groceries.iloc[:, :2]  # first two columns: latitude, longitude
for lat, long in tqdm(list(grocery_points.itertuples(index=False))):
    marker = folium.CircleMarker([lat, long],
                                 radius=10,
                                 color='red',
                                 opacity=.5,
                                 fill=True,
                                 fill_color='lightblue',
                                 fill_opacity=0.1,
                                 parse_html=False)
    marker.add_to(capital_district_locations)

capital_district_locations

In [None]:
# One row per cached venue: the grid point it was found from, plus one
# indicator column per general category.
grid_centers = neighborhoods[['center_lat', 'center_long']]
category_dummies = pd.get_dummies(neighborhoods.gen_cat)
profile = pd.merge(grid_centers,
                   category_dummies,
                   left_index=True,
                   right_index=True)
profile

In [None]:
# Aggregate the indicator columns per grid point: the mean gives the share
# of each category, the sum gives the venue counts.
by_center = profile.groupby(['center_lat', 'center_long'])
profile_means = by_center.mean()
profile_sums = by_center.sum()
profile_sums['totals'] = profile_sums.sum(axis=1)

# `scored_profile` is the same object as `profile_means`: the 'Size' column
# and the scaling below are applied to it in place.
scored_profile = profile_means
scored_profile['Size'] = profile_sums['totals']
scaled_values = RobustScaler().fit_transform(X=scored_profile.iloc[:, :])
scored_profile.iloc[:, :] = scaled_values

# Feature matrix used for clustering.
X = scored_profile.values

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit KMeans for a range of k and record the inertia
# (within-cluster sum of squared distances) of each fit.
ks = list(range(1, 15))
sse = []
for k in ks:
    # Fixed seed so the curve is the same on every run.
    kmm = KMeans(n_clusters=k, random_state=0)
    kmm.fit(X)

    sse.append(kmm.inertia_)

# Plot sse against k
plt.figure(figsize=(6, 6))
plt.plot(ks, sse, '-o')
# Use mathtext for the italic k; markdown-style *k* is drawn literally.
plt.xlabel(r'Number of clusters $k$')
plt.ylabel('Sum of squared distance')
plt.show()

In [None]:
# Fit the final 4-cluster model. The fixed seed keeps the cluster ids stable
# between runs, so colors and saved plot names do not change.
kmm = KMeans(n_clusters=4, random_state=0)
kmm.fit(X)
y_pred = kmm.predict(X)
scored_profile['cluster'] = y_pred

In [None]:
# Common x-axis scale for the cluster box plots: the largest scaled feature
# value. The 'cluster' label column is excluded so a cluster id can never
# set the scale.
x_scale = scored_profile.drop(columns=['cluster'],
                              errors='ignore').max().max()

In [None]:
# Draw one box plot per cluster showing its five highest-mean features, and
# save each figure as 'Cluster_<n>.png'.
clusters = list(range(scored_profile.cluster.nunique()))

# Placeholder values; each entry is replaced by its Axes in the loop below.
cluster_box_plots = dict(zip(clusters, clusters))
for cluster in clusters:
    in_cluster = scored_profile['cluster'] == cluster
    cluster_summary = scored_profile.loc[in_cluster].drop(columns=['cluster'])
    feature_means = cluster_summary.mean().sort_values(ascending=False)
    top5_filter = feature_means.head(5).index
    cluster_top5 = cluster_summary[top5_filter]

    title = "Cluster_{}".format(cluster)
    plt.figure(figsize=(4, 2.5))
    plt.tight_layout()
    ax = sns.boxplot(data=cluster_top5, orient='h', palette="Set2")
    ax.set_title(title)
    ax.set_xbound(0, x_scale)
    ax.set_xticks([])
    ax.figure.savefig('{}.png'.format(title), bbox_inches='tight')
    cluster_box_plots[cluster] = ax

In [None]:
# Color used for each cluster id on the results map.
colors = {
    0: 'red',
    1: 'green',
    2: 'navy',
    3: 'orange',
    4: 'violet',
    5: "yellow",
}

# Assigning a new map replaces any previous `results_map`, so no explicit
# delete (and no bare `except`) is needed.
results_map = folium.Map(location=center[::-1], zoom_start=9)

# Outline of the whole study area.
folium.Circle(center[::-1], radius=50050).add_to(results_map)

# The index of `scored_profile` holds the (center_lat, center_long) pairs.
# Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
for latlong, cluster in scored_profile['cluster'].items():
    label = f'{latlong}'
    label = folium.Popup(label, parse_html=True)
    folium.Circle(latlong,
                  radius=1200 * 1.2,
                  popup=label,
                  color=colors[cluster],
                  opacity=.1,
                  fill=True,
                  fill_color=colors[cluster],
                  fill_opacity=0.25,
                  parse_html=False).add_to(results_map)
results_map