# Counties coordinates (lat, lon)
**Runs in about 30 s**

Two files are required to run this notebook:
- The top 500 US cities by population, based on the 2010 census. It can be found at:
https://chronicdata.cdc.gov/500-Cities-Places/500-Cities-City-level-Data-GIS-Friendly-Format-201/pf7q-w24q

- All US counties with their geolocation. It can be found at: https://www.weather.gov/gis/counties

In [1]:
# Import base packages
import ast
import os

import numpy as np
import pandas as pd
import shapefile
import geopy.distance as gd
from fastkml import kml

Package `lxml` missing. Pretty print will be disabled


In [2]:
# Folder where the two downloaded files listed above are stored.
# Set the COUNTY_DATA_DIR environment variable to point somewhere else without
# editing the notebook; the fallback is the original author's download folder.
data_dir = os.environ.get("COUNTY_DATA_DIR", "/Users/alessandropreviero/Downloads")

# County shapefile (.shp) and the top-500 cities CSV, both inside data_dir
coords_dir = f"{data_dir}/c_10nv20/c_10nv20.shp"
top_cities_dir = f"{data_dir}/top500.csv"

### Read top cities

In [3]:
valid_cols = ['StateAbbr', 'PlaceName', 'Population2010', 'Geolocation']

# Short, lower-case column names used throughout the rest of the notebook
renamed_cols = {
    'StateAbbr': 'state',
    'PlaceName': 'city',
    'Population2010': 'population',
    'Geolocation': 'geolocation'
}

# Largest city first, with a fresh 0..n-1 index. Built as one chained expression
# (no inplace mutation) so re-running the cell always starts from the file.
cities_df = (
    pd.read_csv(top_cities_dir, usecols=valid_cols)
    .sort_values('Population2010', ascending=False)
    .rename(columns=renamed_cols)
    .reset_index(drop=True)
)

# The geolocation column is read as text such as "(40.69, -73.93)"; turn it into a
# numeric tuple. ast.literal_eval only accepts Python literals, so unlike eval it
# cannot execute arbitrary code that might be embedded in the downloaded file.
cities_df['geolocation'] = cities_df['geolocation'].map(ast.literal_eval)

cities_df.head()

Unnamed: 0,state,city,population,geolocation
0,NY,New York,8175133,"(40.694960689, -73.9313850409)"
1,CA,Los Angeles,3792621,"(34.1182277898, -118.408500088)"
2,IL,Chicago,2695598,"(41.8372950615, -87.6862308732)"
3,TX,Houston,2099451,"(29.7806691396, -95.3860033966)"
4,PA,Philadelphia,1526006,"(40.0093147808, -75.1333888571)"


In [4]:
# Sanity check: the city list should span 51 distinct state codes

distinct_states = cities_df['state'].unique()
print(len(distinct_states) == 51)

True


### Read all counties

In [5]:
# Open the county shapefile; individual records are pulled from `sf` in the next cell
sf = shapefile.Reader(coords_dir)
# Number of records in the file. NOTE(review): sf.records() presumably loads every
# record just to count them — confirm whether the reader offers a cheaper count.
num_counties = len(sf.records())

In [6]:
county_df_colnames = ['state', 'county', 'geolocation']


def _county_row(fields):
    """Turn one shapefile record into [state code, county name, (lat, lon)].

    Field positions read from the record: 0 is the state code, 2 the county name,
    6 the longitude and 7 the latitude.
    """
    return [fields[0], fields[2], (fields[7], fields[6])]


# One row per record, fetched by position; coordinates are stored as (lat, lon)
county_coords = [_county_row(sf.record(position)) for position in range(num_counties)]

counties_df = pd.DataFrame(county_coords, columns=county_df_colnames)
counties_df.head()

Unnamed: 0,state,county,geolocation
0,ME,Washington,"(45.0363, -67.6361)"
1,GA,McIntosh,"(31.5329, -81.2646)"
2,GA,Liberty,"(31.7093, -81.2103)"
3,AS,Swains Island,"(-11.0843, -171.0459)"
4,AS,Manu'a,"(-14.2219, -169.506)"


In [7]:
# Sanity check there is at least one county for each of the above states

# The number of distinct codes alone cannot prove coverage (the county file holds
# more codes than the city list), so compare the two sets of state codes directly.
missing_states = set(cities_df.state) - set(counties_df.state)
assert not missing_states, f"No county records for: {sorted(map(str, missing_states))}"

print(len(counties_df.state.unique()))

55


In [8]:
def find_top_county_per_city(cities_df, counties_df, distance_fn=None):
    """Map every city to the closest county within the city's own state.

    Parameters
    ----------
    cities_df : pandas.DataFrame
        Must provide the columns ``state``, ``city`` and ``geolocation``.
    counties_df : pandas.DataFrame
        Must provide the columns ``state``, ``county`` and ``geolocation``.
    distance_fn : callable, optional
        ``distance_fn(city_coord, county_coord)`` returning a number, where a
        smaller value means closer. When omitted, the geopy distance in
        kilometres is used (coordinates in (lat, lon) order).

    Returns
    -------
    dict
        Keys are ``"<state>, <city>"`` (city names repeat across states, so the
        state is part of the key). Values are 2-element arrays
        ``[county, state]`` (county names also repeat across states).

    Raises
    ------
    ValueError
        If ``counties_df`` holds no county for the state of one of the cities.
    """
    if distance_fn is None:
        def distance_fn(city_coord, county_coord):
            return gd.distance(city_coord, county_coord).km

    city_to_county = {}

    for state, city, city_coord in zip(cities_df['state'],
                                       cities_df['city'],
                                       cities_df['geolocation']):

        # Only counties within the city's state can be used
        counties_state_df = counties_df[counties_df.state == state]
        if counties_state_df.empty:
            # Without this guard np.argmin fails with an unhelpful
            # "empty sequence" message that names neither city nor state.
            raise ValueError(
                f"No counties available for state {state!r} (city {city!r})"
            )

        dists = [distance_fn(city_coord, county_coord)
                 for county_coord in counties_state_df['geolocation']]

        # Position of the closest county among this state's rows
        best_idx = int(np.argmin(dists))
        city_to_county[f"{state}, {city}"] = \
            counties_state_df.iloc[best_idx][['county', 'state']].to_numpy()

    return city_to_county

In [9]:
# Nearest same-state county for every city, keyed by "<state>, <city>"
dc = find_top_county_per_city(cities_df, counties_df)

In [10]:
# Sanity: check we have one center per city
# (keys combine state and city, so the count equals the number of cities only
# when no state/city pair occurs twice)
print(len(dc))

500


In [11]:
# Show an example: print only the first city -> county assignment

first_entry = next(iter(dc.items()), None)
if first_entry is not None:
    k, v = first_entry
    print(f"State: {v[1]}\nCity: {k}\nCounty: {v[0]}")

State: NY
City: NY, New York
County: Kings


### Build final dataframe

In [12]:
colnames = ['state', 'city', 'county', 'city_geolocation', 'county_geolocation', 'city_pop']
final = []
for city, county in dc.items():
    # Each value of dc is the pair [county name, state code]
    county_name, state = county

    # Keys look like "<state>, <city>". Cut at the first separator only, so a city
    # name that itself contains a comma is kept whole, and do not strip it, so it
    # still matches the cities_df.city value it was built from.
    bare_city = city.partition(', ')[2]

    # NOTE(review): if a state has several rows with the same county name, this
    # takes the first one, which is not necessarily the row selected as nearest.
    county_arr = counties_df[(counties_df.county == county_name) &
                             (counties_df.state == state)
                            ][['state', 'geolocation']].to_numpy()[0]

    city_arr = cities_df[(cities_df.city == bare_city) &
                         (cities_df.state == state)
                        ][['geolocation', 'population']].to_numpy()[0]

    final.append([county_arr[0], bare_city, county_name, city_arr[0], county_arr[1], city_arr[1]])

complete_df = pd.DataFrame(final, columns=colnames)


In [13]:
# Preview the assembled city / county table
complete_df.head()

Unnamed: 0,state,city,county,city_geolocation,county_geolocation,city_pop
0,NY,New York,Kings,"(40.694960689, -73.9313850409)","(40.6447, -73.9472)",8175133
1,CA,Los Angeles,Los Angeles,"(34.1182277898, -118.408500088)","(34.3203, -118.2252)",3792621
2,IL,Chicago,Cook,"(41.8372950615, -87.6862308732)","(41.8399, -87.8167)",2695598
3,TX,Houston,Harris,"(29.7806691396, -95.3860033966)","(29.8588, -95.3963)",2099451
4,PA,Philadelphia,Philadelphia,"(40.0093147808, -75.1333888571)","(40.0076, -75.1338)",1526006


### Build matrix of distances between counties

In [14]:
# One row per assigned center, one column per county record. Every cell starts at
# -1.0 and keeps that value unless the county lies in the center's state.
n_centers = complete_df.shape[0]
n_counties = counties_df.shape[0]
counties_dists = np.full((n_centers, n_counties), -1.0)

for row_pos in range(n_centers):
    center = complete_df.iloc[row_pos]

    # Restrict to the counties sharing the center's state; their index labels
    # select the columns that receive a distance
    same_state = counties_df[counties_df.state == center['state']]

    # Distance in km from the center county to each county of that state
    counties_dists[row_pos, same_state.index] = [
        gd.distance(center['county_geolocation'], county_coord).km
        for county_coord in same_state['geolocation']
    ]

In [15]:
# Label the matrix: rows by the county chosen for each city, columns by county name
center_labels = complete_df['county'].to_numpy()
county_labels = counties_df['county'].to_numpy()

counties_dists_df = pd.DataFrame(counties_dists, index=center_labels, columns=county_labels)

In [16]:
# Preview the matrix; -1.0 marks counties outside the center's state (never filled in)
counties_dists_df.head()

Unnamed: 0,Washington,McIntosh,Liberty,Swains Island,Manu'a,Western,Eastern,Anasco,Arroyo,Barceloneta,...,Petersburg Census Area,Skagway,Valdez-Cordova,Erie,New York (Manhattan),York,Kenedy,Willacy,Chippewa,Curry
Kings,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,462.316462,14.968662,-1.0,-1.0,-1.0,-1.0,-1.0
Los Angeles,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
Cook,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
Harris,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,396.503158,436.998932,-1.0,-1.0
Philadelphia,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [17]:
# One more sanity check: keep only the distances that were filled in for the
# "Kings" center; the entry for Kings itself should be 0
example = counties_dists_df.loc["Kings"]
is_filled = example.gt(-1)
example[is_filled]

Yates                   343.516674
Allegany                385.075581
Broome                  229.867055
Cattaraugus             433.591751
Chemung                 288.013523
                           ...    
Queens                   18.021122
Kings                     0.000000
Jefferson               410.433725
Erie                    462.316462
New York (Manhattan)     14.968662
Name: Kings, Length: 67, dtype: float64

### Save the results to CSV

In [18]:
# Write both result tables to CSV files in the current working directory
outputs = (
    (complete_df, 'selected_centers.csv'),
    (counties_dists_df, 'counties_distances.csv'),
)
for frame, target in outputs:
    frame.to_csv(target)