# GeoJSON Data EDA

In [1]:
import numpy as np
import pandas as pd

from urllib.request import urlopen
from requests_html import HTMLSession
import json

import itertools

import re

from time import time
from datetime import datetime, timedelta

from shapely.geometry import Polygon

# import geojson for boundaries and census areas

In [2]:
# Download plotly's county-level GeoJSON; the features are later looked up by their `id`.
COUNTY_GEOJSON_URL = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(COUNTY_GEOJSON_URL) as county_response:
    county_json = json.load(county_response)

In [3]:
# Peek at the first feature to see the schema: `properties` (incl. NAME and
# CENSUSAREA), `geometry` (type + coordinates) and the `id` used for lookups.
county_json['features'][0]

{'type': 'Feature',
 'properties': {'GEO_ID': '0500000US01001',
  'STATE': '01',
  'COUNTY': '001',
  'NAME': 'Autauga',
  'LSAD': 'County',
  'CENSUSAREA': 594.436},
 'geometry': {'type': 'Polygon',
  'coordinates': [[[-86.496774, 32.344437],
    [-86.717897, 32.402814],
    [-86.814912, 32.340803],
    [-86.890581, 32.502974],
    [-86.917595, 32.664169],
    [-86.71339, 32.661732],
    [-86.714219, 32.705694],
    [-86.413116, 32.707386],
    [-86.411172, 32.409937],
    [-86.496774, 32.344437]]]},
 'id': '01001'}

In [23]:
# Ids that appear in the NYT data but have no feature in the county GeoJSON.
# The list is built from a set, so its order is arbitrary.
geojson_ids = {feature['id'] for feature in county_json['features']}
fips_to_add_to_json = list(set(nyt_df['fips']).difference(geojson_ids))
fips_to_add_to_json

['kc', 'nyc', '02158', '46102', 'jm']

The `plotly` county GeoJSON dataset is missing Kusilvak Census Area (`'02158'`) and Oglala Lakota County (`'46102'`), in addition to the three cities included in the NYTimes data (New York City, Kansas City, Joplin). GeoJSON data for these five areas was compiled from [Nominatim](https://nominatim.openstreetmap.org/) and [polygons](http://polygons.openstreetmap.fr/).

- Search for the area at [Nominatim](https://nominatim.openstreetmap.org/).
- Select `details` from the relevant entry.
- Copy the numeric `code` under `OSM`, ignoring "relation". E.g., for New York City, copy `175905`.
- Search for the `code` at [polygons](http://polygons.openstreetmap.fr/).
- For our purposes, GeoJSONs were selected according to the following criteria: (1) sparsity of vertices (`NPoints`) and (2) accuracy of shape.

In [24]:
# Folder holding the hand-compiled boundary files (JSON saved with a .txt extension).
BOUNDARY_DATA_URL = 'https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/'


def load_boundary_json(filename):
    """Download `filename` from `BOUNDARY_DATA_URL` and return the parsed JSON.

    Parameters
    ----------
    filename : str
        File name relative to `BOUNDARY_DATA_URL`, e.g. ``'nyc.txt'``.

    Returns
    -------
    The decoded JSON document (a dict for the files used below).

    Raises
    ------
    urllib.error.URLError
        If the file cannot be fetched.
    json.JSONDecodeError
        If the response body is not valid JSON.
    """
    with urlopen(BOUNDARY_DATA_URL + filename) as response:
        return json.load(response)


# new york city, ny
nyc_json = load_boundary_json('nyc.txt')

# kansas city, mo/ks -- two separate files (presumably the Missouri and the
# Kansas side) whose coordinate lists are concatenated into one entry
kcm_json = load_boundary_json('kcm.txt')
kck_json = load_boundary_json('kck.txt')
kc_json = dict(
    coordinates = kcm_json['coordinates'] + kck_json['coordinates']
)

# joplin, mo
jm_json = load_boundary_json('jm.txt')

# oglala lakota county, sd
olsd_json = load_boundary_json('olsd.txt')

# kusilvak census area, ak
kca_json = load_boundary_json('kca.txt')

In [25]:
# https://stackoverflow.com/questions/41271146/
def clean_coordinates(c):
    """Remove one level of nesting from every element of `c`.

    Each element of `c` is a sequence of sequences; its inner sequences are
    concatenated into a single list. The outer list keeps one entry per
    element of `c`, in the same order.
    """
    flattened = []
    for group in c:
        merged = itertools.chain.from_iterable(group)
        flattened.append(list(merged))
    return flattened

In [26]:
# Area, display name and cleaned coordinates for each area missing from the
# county GeoJSON, keyed by the id used in the NYT data.
add_to_json_dict = {
    '02158': dict(
        area=17_081.43,
        name='Kusilvak Census Area',
        coordinates=clean_coordinates(kca_json['coordinates']),
    ),
    '46102': dict(
        area=2_093.90,
        name='Oglala Lakota',
        coordinates=clean_coordinates(olsd_json['coordinates']),
    ),
    'jm': dict(
        area=35.56,
        name='Joplin',
        # this file nests its coordinates under a `geometries` list
        coordinates=clean_coordinates(jm_json['geometries'][0]['coordinates']),
    ),
    'kc': dict(
        # sum of the two component areas
        area=124.81+314.95,
        name='Kansas City',
        coordinates=clean_coordinates(kc_json['coordinates']),
    ),
    'nyc': dict(
        area=302.64,
        name='New York City',
        coordinates=clean_coordinates(nyc_json['coordinates']),
    ),
}

In [27]:
# Append one Feature per missing area to `county_json`.
# Ids that already have a feature are skipped, so re-running this cell does
# not append duplicate features.
# NOTE(review): the cleaned coordinates hold one ring per part, and under the
# GeoJSON spec the rings after the first in a 'Polygon' are holes. A
# multi-part area such as 'kc' may need 'MultiPolygon' instead -- confirm how
# the consumer renders it before changing the type.
present_ids = {feature['id'] for feature in county_json['features']}
for fips in fips_to_add_to_json:
    if fips in present_ids:
        continue
    entry = add_to_json_dict[fips]
    county_json['features'].append(
        {
            'geometry': {'coordinates': entry['coordinates'],
                         'type': 'Polygon'},
            'id': fips,
            'properties': {'NAME': entry['name'],
                           'CENSUSAREA': entry['area']},
            'type': 'Feature'
        }
    )
    present_ids.add(fips)

In [28]:
# Save the augmented GeoJSON (original features plus the appended areas).
with open('data/county_json.json', 'w') as county_json_file:
    json.dump(county_json, county_json_file)

In [29]:
def county_area(i, j=county_json):
    """Look up the `CENSUSAREA` property of the feature whose `id` equals `i`.

    Parameters
    ----------
    i : feature id to search for (compared with ``==`` against each `id`).
    j : GeoJSON-like dict with a `features` list; defaults to `county_json`.

    Returns
    -------
    The `CENSUSAREA` of the first matching feature, or None if none matches.
    """
    matching_areas = (
        feature['properties']['CENSUSAREA']
        for feature in j['features']
        if feature['id'] == i
    )
    return next(matching_areas, None)

In [30]:
# Spot-check: print the area that `county_area` finds for each id in
# `fips_to_add_to_json` (one line per id, in list order).
for fips in fips_to_add_to_json:
    print(county_area(fips))

439.76
302.64
17081.43
2093.9
35.56


# add latitude and longitude coordinates

In [31]:
def _iter_rings(coords):
    """Yield every ring (a sequence of ``[lon, lat]`` positions) nested in `coords`.

    Recurses through any depth of nesting, so it accepts a list of rings as
    well as a list of lists of rings.
    """
    if len(coords) == 0:
        return
    first = coords[0]
    if len(first) > 0 and isinstance(first[0], (int, float)):
        # `coords` is itself a list of positions, i.e. a single ring.
        yield coords
    else:
        for part in coords:
            yield from _iter_rings(part)


def centroid(i, j=county_json):
    """Return ``(lon, lat)`` of the centroid of the largest ring of feature `i`.

    Every ring found in the feature's geometry is turned into a `Polygon`;
    the one with the greatest area is used, the first one winning on ties.

    The rings are walked directly instead of going through
    ``np.array(...).flatten()``. That call fails on ragged coordinate lists in
    recent NumPy versions, and it silently merges all parts into one polygon
    when they happen to have the same number of points.

    Parameters
    ----------
    i : feature id to search for (compared with ``==`` against each `id`).
    j : GeoJSON-like dict with a `features` list; defaults to `county_json`.

    Returns
    -------
    tuple of (lon, lat), or None when no feature has id `i`.

    Raises
    ------
    ValueError
        If the matching feature has no coordinates at all.
    """
    for d in j['features']:
        if d['id'] == i:
            polygons = [Polygon(ring)
                        for ring in _iter_rings(d['geometry']['coordinates'])]
            if not polygons:
                raise ValueError(f'feature {i!r} has no coordinates')
            # max() returns the first maximal element, like list.index(max(...)).
            largest = max(polygons, key=lambda p: p.area)
            lon, lat = largest.centroid.coords[0]
            return lon, lat
    return None

In [32]:
# Inspect a multi-part area: Kansas City is stored as two shapes.
# The id is pinned to 'kc' rather than read from `fips_to_add_to_json[0]`,
# because that list is built from a set and its order can change between
# kernel sessions.
kc_feature = next(d for d in county_json['features'] if d['id'] == 'kc')
shapes = kc_feature['geometry']['coordinates']

print(len(shapes))
areas = [Polygon(shape).area for shape in shapes]
areas

2


[0.0899984999999991, 0.035515830415977646]

In [33]:
# Spot-check: print the (lon, lat) pair that `centroid` returns for each id
# in `fips_to_add_to_json` (one line per id, in list order).
for fips in fips_to_add_to_json:
    print(centroid(fips))

(-94.5544223903731, 39.1271954680726)
(-73.93936847673419, 40.66351613943744)
(-163.5144805328244, 62.15824463834583)
(-102.72778156028366, 43.18510744680853)
(-94.50560539941796, 37.07916084816835)


# add area and coordinates to `pop_df`

In [34]:
# Add `area`, `lon` and `lat` columns to `pop_df` and report the elapsed seconds.
tick = time()
fips_column = pop_df['fips']
pop_df['area'] = fips_column.apply(county_area)
# `centroid` returns (lon, lat) pairs; unzip them into two columns.
lon_lat_pairs = fips_column.apply(centroid).to_list()
pop_df['lon'], pop_df['lat'] = zip(*lon_lat_pairs)
pop_df = optimize(pop_df)
tock = time()
print(tock - tick)

0.759967565536499


In [35]:
# Preview the first rows to confirm the new `area`, `lon` and `lat` columns are populated.
pop_df.head()

Unnamed: 0,state,county,population,fips,area,lon,lat
0,Alabama,Autauga,55869,1001,594.435974,-86.641197,32.536152
1,Alabama,Baldwin,223234,1003,1589.784058,-87.723953,30.725863
2,Alabama,Barbour,24686,1005,884.875977,-85.389244,31.867889
3,Alabama,Bibb,22394,1007,622.58197,-87.124962,32.996456
4,Alabama,Blount,57826,1009,644.776001,-86.569756,33.985249


In [36]:
# Write `pop_df` to CSV; `index=False` leaves the row index out of the file.
pop_df.to_csv('data/pop_df.csv', index=False)