# Census and GeoJSON Data EDA

The goal of this notebook is to obtain and organize the following county-level data:

- nominal data: state, county, fips
- census data: 
    - total population
    - ethnic population(s)
    - voting statistics
    - median income
    - educational attainment
- geographic data (from GeoJSON): 
    - census area
    - latitude/longitude

In [27]:
import numpy as np
import pandas as pd

from urllib.request import urlopen
import json

import itertools

from time import time
from datetime import datetime, timedelta

from shapely.geometry import Polygon
from shapely.geometry import asShape
from geojson import Feature

# 1. import census data from `census.gov`

2019 population estimates can be collected from [census.gov](https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/asrh/). For the most current estimates, we will only save data from `YEAR == 12` ([data dictionary](https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/cc-est2019-alldata.pdf)).

In [29]:
census_url = 'https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/asrh/cc-est2019-alldata.csv'
with urlopen(census_url) as resp:
    dem_df = pd.read_csv(
        resp,
        encoding='latin-1',                        # the file is not valid UTF-8
        dtype={'STATE': 'str', 'COUNTY': 'str'},   # keep the FIPS parts as text
    )

# Reduce the file to one row per county:
#   * SUMLEV carries the same summary-level code (50) on every row -> dropped
#   * YEAR == 12 selects the 2019 estimate; AGEGRP == 0 selects the row that
#     is not broken down by age group (see the linked data dictionary)
#   * once filtered on, YEAR and AGEGRP are constant -> dropped
dem_df = (
    dem_df
    .drop(columns='SUMLEV')
    .loc[lambda df: (df['YEAR'] == 12) & (df['AGEGRP'] == 0)]
    .drop(columns=['YEAR', 'AGEGRP'])
)
dem_df.head()

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,TOT_POP,TOT_MALE,TOT_FEMALE,WA_MALE,WA_FEMALE,BA_MALE,...,HWAC_MALE,HWAC_FEMALE,HBAC_MALE,HBAC_FEMALE,HIAC_MALE,HIAC_FEMALE,HAAC_MALE,HAAC_FEMALE,HNAC_MALE,HNAC_FEMALE
209,1,1,Alabama,Autauga County,55869,27092,28777,20878,21729,5237,...,778,687,89,93,40,27,15,19,16,11
437,1,3,Alabama,Baldwin County,223234,108247,114987,94810,100388,9486,...,5144,4646,268,281,264,197,69,65,55,35
665,1,5,Alabama,Barbour County,24686,13064,11622,6389,5745,6311,...,509,408,63,50,61,26,1,0,14,8
893,1,7,Alabama,Bibb County,22394,11929,10465,8766,8425,2941,...,291,253,32,19,6,15,5,1,17,3
1121,1,9,Alabama,Blount County,57826,28472,29354,27258,28154,516,...,2794,2516,76,58,67,66,18,21,34,21


In [30]:
# summarise the filtered census frame: row count, column dtypes, non-null counts
dem_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 209 to 716357
Data columns (total 77 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   STATE         3142 non-null   object
 1   COUNTY        3142 non-null   object
 2   STNAME        3142 non-null   object
 3   CTYNAME       3142 non-null   object
 4   TOT_POP       3142 non-null   int64 
 5   TOT_MALE      3142 non-null   int64 
 6   TOT_FEMALE    3142 non-null   int64 
 7   WA_MALE       3142 non-null   int64 
 8   WA_FEMALE     3142 non-null   int64 
 9   BA_MALE       3142 non-null   int64 
 10  BA_FEMALE     3142 non-null   int64 
 11  IA_MALE       3142 non-null   int64 
 12  IA_FEMALE     3142 non-null   int64 
 13  AA_MALE       3142 non-null   int64 
 14  AA_FEMALE     3142 non-null   int64 
 15  NA_MALE       3142 non-null   int64 
 16  NA_FEMALE     3142 non-null   int64 
 17  TOM_MALE      3142 non-null   int64 
 18  TOM_FEMALE    3142 non-null   int64 
 19  WA

Notice that county names provided by the US census contain descriptive terms, such as 'County', whereas the NYTimes data does not.

In [31]:
# remove descriptive terms from county names, will use on other dataframes
def remove_county_terms(data, county_col):
    """Strip administrative descriptors from a county-name column.

    Every occurrence of ' County', ' Parish' and ' Municipality' (including
    the leading space) is deleted from ``data[county_col]``. Other
    descriptors (e.g. 'Census Area', 'Municipio') are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the county names. It is modified in place.
    county_col : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, so the call can be chained or reassigned.
    """
    county_terms = ['County', 'Parish', 'Municipality']
    for term in county_terms:
        # regex=False: the terms are literal text, and being explicit keeps the
        # result independent of the pandas version's default for `regex`
        data[county_col] = data[county_col].str.replace(' ' + term, '', regex=False)
    return data

In [32]:
# rename columns to better-match nytimes data (and personal preference)
# (rename returns a new frame; avoids inplace mutation of the filtered slice)
dem_df = dem_df.rename(
    columns={
        'STATE': 'statefips',
        'COUNTY': 'countyfips',
        'STNAME': 'state',
        'CTYNAME': 'county',
        'TOT_POP': 'total_pop'
    }
)
dem_df.columns = dem_df.columns.str.lower()

# nytimes fips is 5-digit combo of state and county fips
# (both parts were read as strings, so `+` concatenates them)
dem_df['fips'] = dem_df['statefips'] + dem_df['countyfips']
dem_df = dem_df.drop(columns=['statefips', 'countyfips'])

# strip ' County' / ' Parish' / ' Municipality' once; the second, identical
# `.pipe(remove_county_terms, ...)` call that used to follow was a no-op
dem_df = remove_county_terms(dem_df, 'county')
dem_df.head()

Unnamed: 0,state,county,total_pop,tot_male,tot_female,wa_male,wa_female,ba_male,ba_female,ia_male,...,hwac_female,hbac_male,hbac_female,hiac_male,hiac_female,haac_male,haac_female,hnac_male,hnac_female,fips
209,Alabama,Autauga,55869,27092,28777,20878,21729,5237,6000,121,...,687,89,93,40,27,15,19,16,11,1001
437,Alabama,Baldwin,223234,108247,114987,94810,100388,9486,10107,903,...,4646,268,281,264,197,69,65,55,35,1003
665,Alabama,Barbour,24686,13064,11622,6389,5745,6311,5595,103,...,408,63,50,61,26,1,0,14,8,1005
893,Alabama,Bibb,22394,11929,10465,8766,8425,2941,1822,53,...,253,32,19,6,15,5,1,17,3,1007
1121,Alabama,Blount,57826,28472,29354,27258,28154,516,462,192,...,2516,76,58,67,66,18,21,34,21,1009


## check county names against NYTimes data

We eventually need to merge with the NYTimes data, so let's see how they match with each other:

In [33]:
nyt_url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
with urlopen(nyt_url) as resp:
    # read fips as text rather than letting pandas infer a numeric dtype
    nyt_df = pd.read_csv(resp, dtype={'fips': 'str'})
nyt_df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0


In [34]:
# county names that occur in the NYTimes data but not in the census frame
# (names only -- the state is not part of the comparison)
nyt_counties = set(nyt_df['county'])
census_counties = set(dem_df['county'])
county_diffs = list(nyt_counties - census_counties)
len(county_diffs)

84

In [35]:
# str() makes every entry comparable before sorting
sorted(map(str, county_diffs))

['Adjuntas',
 'Aguada',
 'Aguadilla',
 'Aguas Buenas',
 'Aibonito',
 'Anasco',
 'Arecibo',
 'Arroyo',
 'Barceloneta',
 'Barranquitas',
 'Bayamon',
 'Cabo Rojo',
 'Caguas',
 'Camuy',
 'Canovanas',
 'Carolina',
 'Catano',
 'Cayey',
 'Ceiba',
 'Ciales',
 'Cidra',
 'Coamo',
 'Comerio',
 'Corozal',
 'Culebra',
 'Dorado',
 'Fajardo',
 'Florida',
 'Guanica',
 'Guayama',
 'Guayanilla',
 'Guaynabo',
 'Gurabo',
 'Hatillo',
 'Hormigueros',
 'Humacao',
 'Isabela',
 'Jayuya',
 'Joplin',
 'Juana Diaz',
 'Juncos',
 'Kansas City',
 'Lajas',
 'Lares',
 'Las Marias',
 'Las Piedras',
 'Loiza',
 'Luquillo',
 'Manati',
 'Maricao',
 'Maunabo',
 'Mayaguez',
 'Moca',
 'Morovis',
 'Naguabo',
 'Naranjito',
 'New York City',
 'Orocovis',
 'Patillas',
 'Penuelas',
 'Ponce',
 'Quebradillas',
 'Rincon',
 'Sabana Grande',
 'Saipan',
 'Salinas',
 'San German',
 'San Lorenzo',
 'San Sebastian',
 'Santa Isabel',
 'St. John',
 'St. Thomas',
 'Tinian',
 'Toa Alta',
 'Toa Baja',
 'Trujillo Alto',
 'Unknown',
 'Utuado',
 '

As expected, the census county data is missing all municipios from [Puerto Rico](https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html) (state FIPS `72`), as well as entries from the Northern Mariana Islands (state FIPS `69`) and the US Virgin Islands (e.g. `St. John`, `St. Thomas`), so we need to append that data to `pop_df`.

## make rows for New York City, Kansas City, and Joplin

Since the NYTimes dataset treats `New York City`, `Kansas City`, and `Joplin` [as their own entities](https://github.com/nytimes/covid-19-data#geographic-exceptions), we need to add them to our population dataframe.

### New York City

`New York City` is the combination of these five counties, [which are coterminous with the five boroughs](https://en.wikipedia.org/wiki/New_York_City#Boroughs):

- Bronx
- Kings
- New York
- Queens
- Richmond

We will arbitrarily assign the `fips` as `36NYC`.

In [36]:
boroughs = ['Bronx', 'Kings', 'New York', 'Queens', 'Richmond']

# New York State rows only; each borough is then picked out by county name
ny_state_dem = dem_df[dem_df['state'] == 'New York']

# start from the first borough's numeric columns and add the others on top
first_borough, *other_boroughs = boroughs
nyc_dem_df = ny_state_dem[ny_state_dem['county'] == first_borough].select_dtypes(include='number')
for borough in other_boroughs:
    borough_counts = ny_state_dem[ny_state_dem['county'] == borough].select_dtypes(include='number')
    # .values discards the index, so the rows add by position rather than by label
    nyc_dem_df = nyc_dem_df + borough_counts.values

# label the combined row; '36NYC' is the made-up fips used for the city
nyc_dem_df = nyc_dem_df.assign(state='New York', county='New York City', fips='36NYC')
nyc_dem_df.head()

Unnamed: 0,total_pop,tot_male,tot_female,wa_male,wa_female,ba_male,ba_female,ia_male,ia_female,aa_male,...,hbac_female,hiac_male,hiac_female,haac_male,haac_female,hnac_male,hnac_female,state,county,fips
417449,8336817,3978439,4358378,2145238,2247804,1046937,1247761,57897,58600,597346,...,294492,65574,67225,21187,22500,8967,9468,New York,New York City,36NYC


In [37]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same stacked frame
dem_df = pd.concat([dem_df, nyc_dem_df], ignore_index=True)
dem_df[dem_df['fips'] == '36NYC']

Unnamed: 0,state,county,total_pop,tot_male,tot_female,wa_male,wa_female,ba_male,ba_female,ia_male,...,hwac_female,hbac_male,hbac_female,hiac_male,hiac_female,haac_male,haac_female,hnac_male,hnac_female,fips
3142,New York,New York City,8336817,3978439,4358378,2145238,2247804,1046937,1247761,57897,...,922001,257362,294492,65574,67225,21187,22500,8967,9468,36NYC


In [38]:
# combined (male + female) head-counts per group; the 'nh' prefixes are
# presumably the census "not Hispanic" categories -- confirm in the data dictionary
group_prefixes = {'white': 'nhwa', 'black': 'nhba', 'asian': 'nhaa', 'hispanic': 'h'}
for group, prefix in group_prefixes.items():
    dem_df[group] = dem_df[prefix + '_male'] + dem_df[prefix + '_female']

In [39]:
# the slimmed-down population table: identifiers, total population, four group totals
pop_cols = ['state', 'county', 'total_pop', 'fips', 'white', 'black', 'asian', 'hispanic']
pop_df = dem_df.loc[:, pop_cols]
pop_df.tail()

Unnamed: 0,state,county,total_pop,fips,white,black,asian,hispanic
3138,Wyoming,Teton,23464,56039,19000,145,378,3554
3139,Wyoming,Uinta,20226,56041,17657,126,92,1871
3140,Wyoming,Washakie,7805,56043,6417,38,55,1108
3141,Wyoming,Weston,6927,56045,6236,45,113,285
3142,New York,New York City,8336817,36NYC,2681976,1825848,1228598,2423590


### Kansas City and Joplin

Kansas City and Joplin both refer to cities that cross county borders in Missouri. Therefore, we have to get our information from [census.gov quickfacts](https://www.census.gov/quickfacts).

We'll use `29KAN` and `29JOP` as our `fips` for these two cities.

In [40]:
# (county, fips, total population, (white, black, asian, hispanic) shares)
# group head-counts are the share of the total, truncated to an integer
city_facts = [
    ('Kansas City', '29KAN', 495_327, (0.601, 0.290, 0.027, 0.102)),
    ('Joplin', '29JOP', 50_925, (0.876, 0.032, 0.019, 0.048)),
]
pop_df2 = pd.DataFrame(
    [['Missouri', county, total, fips, *(int(share * total) for share in shares)]
     for county, fips, total, shares in city_facts],
    columns=pop_cols,
)
pop_df2

Unnamed: 0,state,county,total_pop,fips,white,black,asian,hispanic
0,Missouri,Kansas City,495327,29KAN,297691,143644,13373,50523
1,Missouri,Joplin,50925,29JOP,44610,1629,967,2444


In [41]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same stacked frame
pop_df = pd.concat([pop_df, pop_df2], ignore_index=True)
pop_df.tail(5)

Unnamed: 0,state,county,total_pop,fips,white,black,asian,hispanic
3140,Wyoming,Washakie,7805,56043,6417,38,55,1108
3141,Wyoming,Weston,6927,56045,6236,45,113,285
3142,New York,New York City,8336817,36NYC,2681976,1825848,1228598,2423590
3143,Missouri,Kansas City,495327,29KAN,297691,143644,13373,50523
3144,Missouri,Joplin,50925,29JOP,44610,1629,967,2444


# 2. import geojson for boundaries and census areas

In [42]:
counties_geojson_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_geojson_url) as resp:
    # parsed into plain Python containers (dicts / lists)
    county_json = json.load(resp)

In [43]:
# inspect structure of geojson: look at the first entry of the 'features' list
county_json['features'][0]

{'type': 'Feature',
 'properties': {'GEO_ID': '0500000US01001',
  'STATE': '01',
  'COUNTY': '001',
  'NAME': 'Autauga',
  'LSAD': 'County',
  'CENSUSAREA': 594.436},
 'geometry': {'type': 'Polygon',
  'coordinates': [[[-86.496774, 32.344437],
    [-86.717897, 32.402814],
    [-86.814912, 32.340803],
    [-86.890581, 32.502974],
    [-86.917595, 32.664169],
    [-86.71339, 32.661732],
    [-86.714219, 32.705694],
    [-86.413116, 32.707386],
    [-86.411172, 32.409937],
    [-86.496774, 32.344437]]]},
 'id': '01001'}

## add areas to `county_json`

In [45]:
# fips codes that appear in the NYTimes data but have no feature in the GeoJSON
geojson_ids = {feature['id'] for feature in county_json['features']}
fips_to_add_to_json = list(set(nyt_df['fips']) - geojson_ids)
fips_to_add_to_json

[nan, '78020', '69120', '02158', '69110', '46102', '78030', '78010']

The `plotly` county GeoJSON dataset is missing Kusilvak Census Area (`'02158'`) and Oglala Lakota County (`'46102'`), in addition to the three cities included in the NYTimes data (New York City, Kansas City, Joplin). 

**FUTURE WORK**: we will ignore the following entries in `nyt_df`:
- Northern Mariana Islands (`69xxx`)
- US Virgin Islands (`78xxx`)
- state totals from an unknown county source (`nan`)

GeoJSON data for these five areas was compiled from [Nominatim](https://nominatim.openstreetmap.org/) and [polygons](http://polygons.openstreetmap.fr/):
- Search for the area at [Nominatim](https://nominatim.openstreetmap.org/).
- Select `details` from the relevant entry.
- Copy the numeric `code` under `OSM`, ignoring "relation". Eg. for New York City, copy `175905`.
- Search for the `code` at [polygons](http://polygons.openstreetmap.fr/).
- For our purposes, GeoJSONs were selected according to the following criteria: (1) sparsity of vertices (`NPoints`) and (2) accuracy of shape.

In [46]:
def _fetch_json(url):
    """Download `url` and parse the response body as JSON."""
    with urlopen(url) as resp:
        return json.load(resp)


# new york city, ny
nyc_json = _fetch_json('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/nyc.txt')

# kansas city, mo
kcm_json = _fetch_json('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/kcm.txt')

# joplin, mo
jm_json = _fetch_json('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/jm.txt')

# oglala lakota county, sd
olsd_json = _fetch_json('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/olsd.txt')

# kusilvak census area, ak
kca_json = _fetch_json('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/kca.txt')

In [50]:
# (fips, NAME, CENSUSAREA, downloaded geometry) for each region to be added
extra_regions = [
    ('02158', 'Kusilvak Census Area', 17_081.43, kca_json),
    ('46102', 'Oglala Lakota', 2_093.90, olsd_json),
    ('29JOP', 'Joplin', 35.56, jm_json),
    ('29KAN', 'Kansas City', 314.95, kcm_json),
    ('36NYC', 'New York City', 302.64, nyc_json),
]

# fips -> {'properties': ..., 'geometry': ...}, mirroring the keys of the
# existing features so the entries can be turned into GeoJSON Features.
# NOTE(review): asShape is deprecated in Shapely 1.8 and removed in 2.0
# (`shapely.geometry.shape` is the replacement) -- pin Shapely or migrate.
add_to_json_dict = {
    fips: {
        'properties': {
            'CENSUSAREA': census_area,
            'NAME': name,
        },
        # buffer(0) is presumably here to clean up invalid rings -- TODO confirm
        'geometry': asShape(raw_geometry).buffer(0),
    }
    for fips, name, census_area, raw_geometry in extra_regions
}

In [52]:
# ids already present in the GeoJSON; checking them makes the cell safe to
# re-run (previously every run appended the five features again)
existing_ids = {feature['id'] for feature in county_json['features']}

for fips in ['02158', '46102', '29KAN', '29JOP', '36NYC']:
    if fips in existing_ids:
        continue
    county_json['features'].append(
        Feature(id=fips,
                geometry=add_to_json_dict[fips]['geometry'],
                properties=add_to_json_dict[fips]['properties'])
    )

In [53]:
# persist the GeoJSON as it currently stands in memory
with open('data/county_json.json', 'w') as out_file:
    json.dump(county_json, out_file)

## add centroid latitude and longitude coordinates and county area from `county_json` to `pop_df`

We will use `shapely` to calculate the [centroid](https://en.wikipedia.org/wiki/Centroid) coordinates for the counties (in case we wish to plot bubble maps).

In [62]:
def centroid(i, j=county_json):
    """Return the centroid coordinates of the feature whose id is `i`.

    Parameters
    ----------
    i : str
        Feature id to look up (the fips code stored under each feature's 'id').
    j : dict
        GeoJSON-like mapping with a 'features' list. The default is bound to
        the `county_json` object that exists when this function is defined.

    Returns
    -------
    tuple
        First coordinate pair of the shapely centroid; used as (lon, lat) by
        the caller. `(nan, nan)` when no feature has id `i`, so that callers
        unpacking the result do not fail on an implicit `None`.
    """
    for d in j['features']:
        if d['id'] == i:
            # buffer(0) is presumably here to clean up invalid rings -- TODO confirm
            polygon = asShape(d['geometry']).buffer(0)
            return polygon.centroid.coords[0]
    return (float('nan'), float('nan'))
        
def county_area(i, j=county_json):
    """Return the 'CENSUSAREA' property of the first feature with id `i`.

    `j` is a GeoJSON-like mapping with a 'features' list (defaults to the
    `county_json` object bound at definition time). Returns None when no
    feature matches.
    """
    matching_areas = (
        feature['properties']['CENSUSAREA']
        for feature in j['features']
        if feature['id'] == i
    )
    return next(matching_areas, None)

# def centroid(i, j=county_json):
#     for d in j['features']:
#         if d['id'] == i:
#             shapes = np.array(d['geometry']['coordinates'])
#             # if county_json has multiple polygons
#             if shapes.ndim != 2:
#                 areas = [Polygon(shape).area for shape in shapes]
#                 coords = [Polygon(shape).centroid.coords[0] for shape in shapes]
#                 lon = np.average(list(zip(*coords))[0], weights=areas)
#                 lat = np.average(list(zip(*coords))[1], weights=areas)
#             # if county_json has one polygon
#             else:
#                 shapes = np.reshape(shapes, (-1, 2))
#                 p = Polygon(shapes)
#                 lon, lat = p.centroid.coords[0]
#             return lon, lat

In [63]:
started_at = time()

# look up each county's census area and centroid by fips
pop_df['area'] = pop_df['fips'].apply(county_area)
lon_lat_pairs = pop_df['fips'].apply(centroid).to_list()
# transpose the list of pairs into one sequence per coordinate
pop_df['lon'], pop_df['lat'] = zip(*lon_lat_pairs)

# wall-clock seconds spent on the two lookups
print(time() - started_at)
pop_df.tail()

0.9704318046569824


Unnamed: 0,state,county,total_pop,fips,white,black,asian,hispanic,area,lon,lat
3140,Wyoming,Washakie,7805,56043,6417,38,55,1108,2238.549,-107.681649,43.904771
3141,Wyoming,Weston,6927,56045,6236,45,113,285,2398.089,-104.56729,43.839661
3142,New York,New York City,8336817,36NYC,2681976,1825848,1228598,2423590,302.64,-73.939368,40.663516
3143,Missouri,Kansas City,495327,29KAN,297691,143644,13373,50523,314.95,-94.554422,39.127195
3144,Missouri,Joplin,50925,29JOP,44610,1629,967,2444,35.56,-94.505664,37.078985


# 3. add 2016 general election data

Mask compliance has been very political, so it would be interesting to see how political differences vary by county. Data taken from [github.com/tonmcg](https://github.com/tonmcg). Alaska data taken from [RRH Elections](https://rrhelections.com/index.php/2018/02/02/alaska-results-by-county-equivalent-1960-2016/).

In [64]:
election_url = 'https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-16/master/2016_US_County_Level_Presidential_Results.csv'
vote_dtypes = {
    'votes_dem': 'int',
    'votes_gop': 'int',
    'total_votes': 'int',
    'combined_fips': 'str',
}
with urlopen(election_url) as resp:
    elect_df = pd.read_csv(
        resp,
        index_col=0,             # first column of the file becomes the index
        encoding='latin-1',      # to avoid unicode error
        dtype=vote_dtypes,
    )
elect_df.head()

Unnamed: 0,votes_dem,votes_gop,total_votes,per_dem,per_gop,diff,per_point_diff,state_abbr,county_name,combined_fips
0,93003,130413,246588,0.377159,0.52887,37410,15.17%,AK,Alaska,2013
1,93003,130413,246588,0.377159,0.52887,37410,15.17%,AK,Alaska,2016
2,93003,130413,246588,0.377159,0.52887,37410,15.17%,AK,Alaska,2020
3,93003,130413,246588,0.377159,0.52887,37410,15.17%,AK,Alaska,2050
4,93003,130413,246588,0.377159,0.52887,37410,15.17%,AK,Alaska,2060


In [65]:
# align column names with the other tables
elect_df = elect_df.rename(columns={'county_name': 'county', 'combined_fips': 'fips'})
elect_df = remove_county_terms(elect_df, 'county')

# left-pad fips with zeros to five characters (https://stackoverflow.com/a/23836353)
elect_df['fips'] = elect_df['fips'].apply('{0:0>5}'.format)

# keep identifiers and raw vote counts only, ordered by fips
kept_cols = ['state_abbr', 'county', 'fips', 'votes_dem', 'votes_gop', 'total_votes']
elect_df = elect_df[kept_cols].sort_values(by='fips')
elect_df.head()

Unnamed: 0,state_abbr,county,fips,votes_dem,votes_gop,total_votes
29,AL,Autauga,1001,5908,18110,24661
30,AL,Baldwin,1003,18409,72780,94090
31,AL,Barbour,1005,4848,5431,10390
32,AL,Bibb,1007,1874,6733,8748
33,AL,Blount,1009,2150,22808,25384


## add New York City, Kansas City, and Joplin election data

In [66]:
# New York State rows only; each borough is then picked out by county name
ny_elect = elect_df[elect_df['state_abbr'] == 'NY']

# sum the numeric (vote count) columns over the five boroughs
first_borough, *other_boroughs = boroughs
nyc_elect_df = ny_elect[ny_elect['county'] == first_borough].select_dtypes(include='number')
for borough in other_boroughs:
    borough_votes = ny_elect[ny_elect['county'] == borough].select_dtypes(include='number')
    # .values discards the index, so the rows add by position rather than by label
    nyc_elect_df = nyc_elect_df + borough_votes.values

nyc_elect_df

Unnamed: 0,votes_dem,votes_gop,total_votes
1982,1969920,461174,2490750


In [33]:
# nyc_elect_df['per_dem'] = nyc_elect_df['votes_dem'] / nyc_elect_df['total_votes']
# nyc_elect_df['per_gop'] = nyc_elect_df['votes_gop'] / nyc_elect_df['total_votes']

In [67]:
# attach the identifying columns used by the rest of elect_df
nyc_elect_df = nyc_elect_df.assign(state_abbr='NY', county='New York City', fips='36NYC')
nyc_elect_df.head()

Unnamed: 0,votes_dem,votes_gop,total_votes,state_abbr,county,fips
1982,1969920,461174,2490750,NY,New York City,36NYC


In [68]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same stacked frame
elect_df = pd.concat([elect_df, nyc_elect_df], ignore_index=True)
elect_df.tail()

Unnamed: 0,state_abbr,county,fips,votes_dem,votes_gop,total_votes
3137,WY,Teton,56039,7313,3920,12176
3138,WY,Uinta,56041,1202,6154,8053
3139,WY,Washakie,56043,532,2911,3715
3140,WY,Weston,56045,294,2898,3334
3141,NY,New York City,36NYC,1969920,461174,2490750


## add alaska elections data

Data taken from [RRH Elections](https://rrhelections.com/index.php/2018/02/02/alaska-results-by-county-equivalent-1960-2016/).

In [69]:
# keep only the first 29 rows and first 12 columns of the 'By CE' sheet
ak_elect_df = (
    pd.read_excel('data/2016 AK Gen Official.xlsx', sheet_name='By CE')
    .iloc[:29, :12]
)
ak_elect_df.head()

Unnamed: 0,ED/Muni,Municipality Code,Registered Voters,Times Counted,"Castle, Darrell L.","Clinton, Hillary","De La Fuente, Roque","Johnson, Gary","Stein, Jill","Trump, Donald J.",Write-in 60,ED Total
0,Ketchikan Gateway,Ketchikan,10512,4283,48,1295,13,339,84,2354,104,4237
1,Prince of Wales-Hyder,Prince of Wales-Hyder,4630,1831,67,666,29,93,65,831,59,1810
2,Sitka,Sitka,7218,2787,38,1261,18,145,78,1146,73,2759
3,Petersburg,Petersburg,2741,1078,12,334,7,64,37,577,32,1063
4,Wrangell,Wrangell,1731,764,7,177,3,35,13,512,13,760


In [70]:
# check dtypes before converting the vote columns to integers
ak_elect_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ED/Muni               29 non-null     object
 1   Municipality Code     29 non-null     object
 2   Registered Voters     29 non-null     object
 3   Times Counted         29 non-null     object
 4   Castle, Darrell L.    29 non-null     object
 5   Clinton, Hillary      29 non-null     object
 6   De La Fuente, Roque   29 non-null     object
 7   Johnson, Gary         29 non-null     object
 8   Stein, Jill           29 non-null     object
 9   Trump, Donald J.      29 non-null     object
 10  Write-in 60           29 non-null     object
 11  ED Total              29 non-null     object
dtypes: object(12)
memory usage: 2.8+ KB


In [71]:
# note: the source headers carry a trailing space
# (rename returns a new frame; avoids inplace mutation of the iloc slice)
ak_elect_df = ak_elect_df.rename(
    columns={
        'Trump, Donald J. ': 'votes_gop',
        'Clinton, Hillary ': 'votes_dem'
    }
)

# keep the area name plus the three count columns, ordered by area name;
# a single sort is enough (the cell used to sort twice on the same key)
vote_cols = ['votes_gop', 'votes_dem', 'ED Total']
ak_elect_df = ak_elect_df[['ED/Muni'] + vote_cols].sort_values(by='ED/Muni')
ak_elect_df[vote_cols] = ak_elect_df[vote_cols].astype(int)
ak_elect_df.head()

Unnamed: 0,ED/Muni,votes_gop,votes_dem,ED Total
22,Aleutians East,198,121,369
24,Aleutians West,260,493,846
19,Anchorage,39942,32130,81678
12,Bethel,809,2178,3933
25,Bristol Bay,180,99,316


In [72]:
# both counts must agree for the positional assignment in the next cell
ak_row_count = len(ak_elect_df)
elect_ak_row_count = len(elect_df[elect_df['state_abbr'] == 'AK'])
print(ak_row_count)
print(elect_ak_row_count)

29
29


In [73]:
# NOTE(review): this assignment is purely positional -- row k of ak_elect_df
# (sorted by 'ED/Muni' name) overwrites the k-th AK row of elect_df (ordered
# by fips). Nothing here checks that the two orderings describe the same
# areas; confirm before trusting per-area Alaska numbers.
ak_rows = elect_df['state_abbr'] == 'AK'
target_cols = ['votes_gop', 'votes_dem', 'total_votes']
source_cols = ['votes_gop', 'votes_dem', 'ED Total']
elect_df.loc[ak_rows, target_cols] = ak_elect_df[source_cols].values

In [74]:
# (disabled) the raw vote columns are kept, so this drop stays commented out:
# elect_df = elect_df.drop(columns=['votes_dem', 'votes_gop'])
# the last rows are non-Alaska rows, shown here after the Alaska overwrite
elect_df.tail()

Unnamed: 0,state_abbr,county,fips,votes_dem,votes_gop,total_votes
3137,WY,Teton,56039,7313,3920,12176
3138,WY,Uinta,56041,1202,6154,8053
3139,WY,Washakie,56043,532,2911,3715
3140,WY,Weston,56045,294,2898,3334
3141,NY,New York City,36NYC,1969920,461174,2490750


In [75]:
# spot-check the Alaska rows after the overwrite
elect_df[elect_df['state_abbr'] == 'AK'].head()

Unnamed: 0,state_abbr,county,fips,votes_dem,votes_gop,total_votes
67,AK,Alaska,2013,121,198,369
68,AK,Alaska,2016,493,260,846
69,AK,Alaska,2020,32130,39942,81678
70,AK,Alaska,2050,2178,809,3933
71,AK,Alaska,2060,99,180,316


In [76]:
# left join: every pop_df row is kept; fips without election data get NaN votes
vote_columns = elect_df[['fips', 'votes_gop', 'votes_dem', 'total_votes']]
pop_df = pd.merge(pop_df, vote_columns, on='fips', how='left')

### adjusted 2-party voting

In [78]:
# pop_df['per_gop'] = pop_df['per_votes_gop'] / (pop_df['per_votes_gop'] + pop_df['per_votes_dem'])

# 4. add income data

Median income statistics taken from [data.census.gov](https://data.census.gov/cedsci/table?q=s1901&tid=ACSST1Y2018.S1901) (the file loaded below, `ACSST5Y2018.S1901`, is the 2018 ACS 5-Year Estimates table). According to the data table, the median household income is reported in column `S1901_C01_012E`.

In [79]:
income_df = (
    pd.read_csv('data/ACSST5Y2018.S1901_data_with_overlays_2020-07-16T134009.csv',
                usecols=['GEO_ID', 'NAME', 'S1901_C01_012E'])
    .drop(index=0)    # row 0 is dropped -- presumably the descriptive "overlay" header row
    .rename(columns={'GEO_ID': 'fips', 'S1901_C01_012E': 'median_income'})
)
income_df['median_income'] = income_df['median_income'].astype(float)
# the fips code is the last five characters of GEO_ID
income_df['fips'] = income_df['fips'].str[-5:]
income_df.head()

Unnamed: 0,fips,NAME,median_income
1,1001,"Autauga County, Alabama",58786.0
2,1003,"Baldwin County, Alabama",55962.0
3,1005,"Barbour County, Alabama",34186.0
4,1007,"Bibb County, Alabama",45340.0
5,1009,"Blount County, Alabama",48695.0


In [80]:
# list the rows whose median income is missing after the float conversion
income_df[income_df['median_income'].isna()]

Unnamed: 0,fips,NAME,median_income
1817,35039,"Rio Arriba County, New Mexico",


Rio Arriba Income statistics taken from [datausa.io](https://datausa.io/profile/geo/rio-arriba-county-nm#:~:text=Median%20household%20income%20in%20Rio%20Arriba%20County%2C%20NM%20is%20%2433%2C422.)

In [81]:
# .at is a scalar accessor (one row label, one column label); selecting rows
# with a boolean mask requires .loc
income_df.loc[income_df['fips'] == '35039', 'median_income'] = 33_422

In [82]:
# NAME looks like "<county>, <state>"; split it into its two parts
name_parts = income_df['NAME'].str.split(', ').tolist()
county_names, state_names = zip(*name_parts)
income_df['county'] = county_names
income_df['state'] = state_names

income_df = remove_county_terms(income_df.drop(columns='NAME'), 'county')
# safe to cast to int now that no value is missing
income_df['median_income'] = income_df['median_income'].astype(int)
income_df.head()

Unnamed: 0,fips,median_income,county,state
1,1001,58786,Autauga,Alabama
2,1003,55962,Baldwin,Alabama
3,1005,34186,Barbour,Alabama
4,1007,45340,Bibb,Alabama
5,1009,48695,Blount,Alabama


In [83]:
# only an estimate of median income:
# the population-weighted average of the five borough medians

weighted_income = 0
nyc_pop = 0
for b in boroughs:
    borough_income = income_df[(income_df['state'] == 'New York') & (income_df['county'] == b)]['median_income'].values
    borough_pop = pop_df[(pop_df['state'] == 'New York') & (pop_df['county'] == b)]['total_pop'].values
    weighted_income = weighted_income + borough_income * borough_pop
    nyc_pop = nyc_pop + borough_pop

nyc_income = weighted_income / nyc_pop
nyc_income[0]

61884.61918031786

In [84]:
# one row per NYTimes-specific area, in income_df's column order
special_incomes = [
    ['36NYC', int(nyc_income[0]), 'New York City', 'New York'],
    ['29KAN', 52405, 'Kansas City', 'Missouri'],
    ['29JOP', 42782, 'Joplin', 'Missouri'],
]
income_df2 = pd.DataFrame(special_incomes, columns=income_df.columns)
income_df2

Unnamed: 0,fips,median_income,county,state
0,36NYC,61884,New York City,New York
1,29KAN,52405,Kansas City,Missouri
2,29JOP,42782,Joplin,Missouri


In [85]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same stacked frame
income_df = pd.concat([income_df, income_df2], ignore_index=True)
income_df.tail(5)

Unnamed: 0,fips,median_income,county,state
3218,72151,16013,Yabucoa Municipio,Puerto Rico
3219,72153,14954,Yauco Municipio,Puerto Rico
3220,36NYC,61884,New York City,New York
3221,29KAN,52405,Kansas City,Missouri
3222,29JOP,42782,Joplin,Missouri


In [86]:
# left join on fips: every pop_df row is kept
income_by_fips = income_df[['fips', 'median_income']]
pop_df = pd.merge(pop_df, income_by_fips, on='fips', how='left')
pop_df.head()

Unnamed: 0,state,county,total_pop,fips,white,black,asian,hispanic,area,lon,lat,votes_gop,votes_dem,total_votes,median_income
0,Alabama,Autauga,55869,1001,41215,11098,646,1671,594.436,-86.641196,32.536153,18110.0,5908.0,24661.0,58786
1,Alabama,Baldwin,223234,1003,185747,19215,2346,10534,1589.784,-87.723954,30.725862,72780.0,18409.0,94090.0,55962
2,Alabama,Barbour,24686,1005,11235,11807,116,1117,884.876,-85.389245,31.867889,5431.0,4848.0,10390.0,34186
3,Alabama,Bibb,22394,1007,16663,4719,46,623,622.582,-87.124963,32.996456,6733.0,1874.0,8748.0,45340
4,Alabama,Blount,57826,1009,50176,872,163,5582,644.776,-86.569756,33.985248,22808.0,2150.0,25384.0,48695


# 5. add educational attainment data

Educational attainment statistics taken from [data.census.gov](https://data.census.gov/cedsci/table?tid=ACSST5Y2018.S1501&g=0400000US04) (2018 ACS 5-Year Estimates, table `ACSST5Y2018.S1501`).

- `S1501_C01_006E` -- population > 25yo
- `S1501_C01_007E` -- less than 9th grade
- `S1501_C01_008E` -- some high school
- `S1501_C01_009E` -- high school or GED
- `S1501_C01_010E` -- some college
- `S1501_C01_011E` -- associate's
- `S1501_C01_012E` -- bachelor's
- `S1501_C01_013E` -- graduate or professional

In [88]:
# source column codes S1501_C01_006E .. S1501_C01_013E and their short names
edu_cols = [f'S1501_C01_{i:03d}E' for i in range(6, 14)]
edu_col_names = ['pop25', 'no_hs', 'some_hs', 'hs', 'some_college', 'associates', 'bachelors', 'graduate']
edu_dict = {**dict(zip(edu_cols, edu_col_names)), 'GEO_ID': 'fips'}

edu_df = (
    pd.read_csv('data/ACSST5Y2018.S1501_data_with_overlays_2020-07-18T170455.csv',
                usecols=['GEO_ID', 'NAME'] + edu_cols)
    .drop(index=0)    # row 0 is dropped -- presumably the descriptive "overlay" header row
    .astype({col: int for col in edu_cols})
    .rename(columns=edu_dict)
)
# the fips code is the last five characters of GEO_ID
edu_df['fips'] = edu_df['fips'].str[-5:]
edu_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,fips,NAME,pop25,no_hs,some_hs,hs,some_college,associates,bachelors,graduate
1,1001,"Autauga County, Alabama",37166,956,3248,12119,7554,2998,5903,4388
2,1003,"Baldwin County, Alabama",146989,3978,10332,40579,32266,13759,30431,15644
3,1005,"Barbour County, Alabama",18173,1490,3411,6486,3287,1279,1417,803
4,1007,"Bibb County, Alabama",15780,903,1747,7471,2938,908,1197,616
5,1009,"Blount County, Alabama",39627,2967,4894,13489,8492,4775,3217,1793


In [89]:
# NAME looks like "<county>, <state>"; split it into its two parts
name_parts = edu_df['NAME'].str.split(', ').tolist()
county_names, state_names = zip(*name_parts)
edu_df['county'] = county_names
edu_df['state'] = state_names

edu_df = remove_county_terms(edu_df.drop(columns='NAME'), 'county')
edu_df.head()

Unnamed: 0,fips,pop25,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,county,state
1,1001,37166,956,3248,12119,7554,2998,5903,4388,Autauga,Alabama
2,1003,146989,3978,10332,40579,32266,13759,30431,15644,Baldwin,Alabama
3,1005,18173,1490,3411,6486,3287,1279,1417,803,Barbour,Alabama
4,1007,15780,903,1747,7471,2938,908,1197,616,Bibb,Alabama
5,1009,39627,2967,4894,13489,8492,4775,3217,1793,Blount,Alabama


In [90]:
# New York State rows only; each borough is then picked out by county name
ny_edu = edu_df[edu_df['state'] == 'New York']

# sum the numeric (attainment count) columns over the five boroughs
first_borough, *other_boroughs = boroughs
nyc_edu_df = ny_edu[ny_edu['county'] == first_borough].select_dtypes(include='number')
for borough in other_boroughs:
    borough_edu = ny_edu[ny_edu['county'] == borough].select_dtypes(include='number')
    # .values discards the index, so the rows add by position rather than by label
    nyc_edu_df = nyc_edu_df + borough_edu.values

nyc_edu_df = nyc_edu_df.assign(state='New York', county='New York City', fips='36NYC')
nyc_edu_df.head()

Unnamed: 0,pop25,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,state,county,fips
1831,5923498,565345,523873,1421617,815961,379457,1292814,924431,New York,New York City,36NYC


In [91]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same stacked frame
edu_df = pd.concat([edu_df, nyc_edu_df], ignore_index=True)
edu_df.tail()

Unnamed: 0,fips,pop25,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,county,state
3216,72147,6282,1096,480,2344,487,580,933,362,Vieques Municipio,Puerto Rico
3217,72149,15288,2599,1134,5616,1648,1218,2545,528,Villalba Municipio,Puerto Rico
3218,72151,23916,4975,2245,5972,3636,2645,3706,737,Yabucoa Municipio,Puerto Rico
3219,72153,25976,4977,2259,8182,2381,1791,4902,1484,Yauco Municipio,Puerto Rico
3220,36NYC,5923498,565345,523873,1421617,815961,379457,1292814,924431,New York City,New York


In [92]:
# hand-entered attainment counts for the two Missouri cities, in edu_df's column order
city_edu_rows = [
    ['29KAN', 325065, 11373, 22302, 82996, 73203, 23673, 69682, 41836, 'Kansas City', 'Missouri'],
    ['29JOP', 33571, 779, 2580, 10582, 8462, 2576, 5759, 2833, 'Joplin', 'Missouri'],
]
edu_df2 = pd.DataFrame(city_edu_rows, columns=edu_df.columns)
edu_df2

Unnamed: 0,fips,pop25,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,county,state
0,29KAN,325065,11373,22302,82996,73203,23673,69682,41836,Kansas City,Missouri
1,29JOP,33571,779,2580,10582,8462,2576,5759,2833,Joplin,Missouri


In [93]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same stacked frame
edu_df = pd.concat([edu_df, edu_df2], ignore_index=True)
edu_df.tail()

Unnamed: 0,fips,pop25,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,county,state
3218,72151,23916,4975,2245,5972,3636,2645,3706,737,Yabucoa Municipio,Puerto Rico
3219,72153,25976,4977,2259,8182,2381,1791,4902,1484,Yauco Municipio,Puerto Rico
3220,36NYC,5923498,565345,523873,1421617,815961,379457,1292814,924431,New York City,New York
3221,29KAN,325065,11373,22302,82996,73203,23673,69682,41836,Kansas City,Missouri
3222,29JOP,33571,779,2580,10582,8462,2576,5759,2833,Joplin,Missouri


In [94]:
# left join on fips: every pop_df row is kept
edu_by_fips = edu_df[['fips'] + edu_col_names]
pop_df = pd.merge(pop_df, edu_by_fips, on='fips', how='left')
pop_df.tail()

Unnamed: 0,state,county,total_pop,fips,white,black,asian,hispanic,area,lon,...,total_votes,median_income,pop25,no_hs,some_hs,hs,some_college,associates,bachelors,graduate
3140,Wyoming,Washakie,7805,56043,6417,38,55,1108,2238.549,-107.681649,...,3715.0,53426,5662,181,409,1717,1434,701,849,371
3141,Wyoming,Weston,6927,56045,6236,45,113,285,2398.089,-104.56729,...,3334.0,52867,5014,129,260,1796,1334,534,676,285
3142,New York,New York City,8336817,36NYC,2681976,1825848,1228598,2423590,302.64,-73.939368,...,2490750.0,61884,5923498,565345,523873,1421617,815961,379457,1292814,924431
3143,Missouri,Kansas City,495327,29KAN,297691,143644,13373,50523,314.95,-94.554422,...,,52405,325065,11373,22302,82996,73203,23673,69682,41836
3144,Missouri,Joplin,50925,29JOP,44610,1629,967,2444,35.56,-94.505664,...,,42782,33571,779,2580,10582,8462,2576,5759,2833


In [95]:
pop_df['pop_density'] = pop_df['total_pop'] / pop_df['area']

# (denominator, numerators): each numerator gets a 'per_<name>' share column
# NOTE(review): 'some_college' has no share column -- confirm that is intended
share_groups = [
    ('total_pop', ['white', 'black', 'asian', 'hispanic', 'total_votes']),
    ('total_votes', ['votes_gop', 'votes_dem']),
    ('pop25', ['no_hs', 'some_hs', 'hs', 'associates', 'bachelors', 'graduate']),
]
for denominator, numerators in share_groups:
    for col in numerators:
        pop_df['per_' + col] = pop_df[col] / pop_df[denominator]

In [96]:
# share of the 25+ population holding an associate's degree or higher
degree_holders = pop_df['associates'] + pop_df['bachelors'] + pop_df['graduate']
pop_df['per_college'] = degree_holders / pop_df['pop25']

# GOP share of the two-party (GOP + Dem) vote
two_party_votes = pop_df['votes_gop'] + pop_df['votes_dem']
pop_df['per_gop'] = pop_df['votes_gop'] / two_party_votes

# NOTE(review): a comparison involving NaN is False, so rows without vote data
# end up flagged gop == False -- confirm that is acceptable downstream
pop_df['gop'] = pop_df['votes_gop'] > pop_df['votes_dem']

In [97]:
# preview the merged table with the derived share columns
pop_df.head()

Unnamed: 0,state,county,total_pop,fips,white,black,asian,hispanic,area,lon,...,per_votes_dem,per_no_hs,per_some_hs,per_hs,per_associates,per_bachelors,per_graduate,per_college,per_gop,gop
0,Alabama,Autauga,55869,1001,41215,11098,646,1671,594.436,-86.641196,...,0.239569,0.025722,0.087392,0.326078,0.080665,0.158828,0.118065,0.357558,0.754018,True
1,Alabama,Baldwin,223234,1003,185747,19215,2346,10534,1589.784,-87.723954,...,0.195653,0.027063,0.070291,0.276068,0.093606,0.207029,0.10643,0.407064,0.798123,True
2,Alabama,Barbour,24686,1005,11235,11807,116,1117,884.876,-85.389245,...,0.466603,0.08199,0.187696,0.356903,0.070379,0.077973,0.044186,0.192538,0.528359,True
3,Alabama,Bibb,22394,1007,16663,4719,46,623,622.582,-87.124963,...,0.21422,0.057224,0.11071,0.473447,0.057541,0.075856,0.039037,0.172433,0.78227,True
4,Alabama,Blount,57826,1009,50176,872,163,5582,644.776,-86.569756,...,0.084699,0.074873,0.123502,0.340399,0.120499,0.081182,0.045247,0.246928,0.913855,True


## save results to csv

In [98]:
# write both tables without the index column
for frame, csv_path in ((dem_df, 'data/dem_df.csv'), (pop_df, 'data/pop_df.csv')):
    frame.to_csv(csv_path, index=False)

In [101]:
from sklearn import metrics
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [103]:
# (transformer name, upper bound of the min-max range, columns scaled to [0, upper bound]).
# A larger upper bound gives those columns more weight in distance-based clustering.
_scaling_spec = [
    ('mm1', 1, ['per_white', 'per_black', 'per_asian', 'per_votes_gop']),
    ('mm2', 5, ['lon', 'lat']),
    ('mm3', 4, ['total_pop']),
    ('mm4', 4, ['pop_density']),
    ('mm5', 3, ['median_income']),
    ('mm6', 3, ['per_no_hs', 'per_some_hs', 'per_hs', 'per_associates', 'per_bachelors']),
]
ct = ColumnTransformer(transformers=[
    (name, MinMaxScaler(feature_range=(0, upper)), columns)
    for name, upper, columns in _scaling_spec
])

In [104]:
# Wrap the column transformer in the per-group SklearnWrapper helper.
# NOTE(review): SklearnWrapper passes `df.values` (a NumPy array) to the wrapped
# transformer, while `ct` selects its columns by name — confirm this combination
# works before relying on `cw`.
cw = SklearnWrapper(ct)

In [129]:
# Show every row that has a missing value in at least one column.
has_missing = pop_df.isna().any(axis='columns')
pop_df[has_missing]

Unnamed: 0,state,county,total_pop,fips,white,black,asian,hispanic,area,lon,...,per_votes_dem,per_no_hs,per_some_hs,per_hs,per_associates,per_bachelors,per_graduate,per_college,per_gop,gop
81,Alaska,Kusilvak Census Area,8314,02158,281,32,35,249,17081.43,-163.514481,...,,0.075247,0.147707,0.536357,0.026856,0.033443,0.014695,0.074994,,False
548,Hawaii,Kalawao,86,15005,23,0,7,1,11.991,-156.95113,...,,0.043478,0.086957,0.275362,0.173913,0.188406,0.057971,0.42029,,False
2412,South Dakota,Oglala Lakota,14177,46102,669,55,18,579,2093.9,-102.727491,...,,0.048483,0.191439,0.253082,0.08727,0.086993,0.030614,0.204876,,False
3143,Missouri,Kansas City,495327,29KAN,297691,143644,13373,50523,314.95,-94.554422,...,,0.034987,0.068608,0.255321,0.072825,0.214363,0.1287,0.415889,,False
3144,Missouri,Joplin,50925,29JOP,44610,1629,967,2444,35.56,-94.505664,...,,0.023205,0.076852,0.315213,0.076733,0.171547,0.084388,0.332668,,False


In [131]:
# Replace every NaN, in every column, with 0.5 (presumably so the scalers and
# clustering receive no missing values).
# NOTE(review): this is a blanket fill — any non-share column that contains NaN
# (e.g. raw counts) also becomes 0.5; confirm that only share columns are affected.
pop_df2 = pop_df.fillna(0.5)

In [132]:
# One group of rows per state.
grouped = pop_df2.groupby('state')

In [116]:
# Number of groups (one per distinct `state` value).
len(grouped)

51

In [133]:
def _scale_state(state_df):
    """Fit `ct` on one state's rows and return the scaled feature array."""
    return ct.fit_transform(state_df[cols])


# Scale each state independently, so min/max are taken within the state.
ctg = grouped.apply(_scale_state)

In [134]:
# Average of each state's scaled feature array, pooled over all rows and columns.
ctg.apply(np.average)

state
Alabama                 1.086205
Alaska                  1.029937
Arizona                 1.094745
Arkansas                1.026340
California              1.016777
Colorado                0.975909
Connecticut             1.415231
Delaware                1.353726
District of Columbia    0.000000
Florida                 1.164808
Georgia                 0.989665
Hawaii                  1.287798
Idaho                   0.904749
Illinois                0.988212
Indiana                 1.002659
Iowa                    0.995558
Kansas                  0.977500
Kentucky                1.036492
Louisiana               1.013580
Maine                   1.094182
Maryland                1.247166
Massachusetts           1.271021
Michigan                1.067565
Minnesota               0.892140
Mississippi             1.175074
Missouri                1.022765
Montana                 1.142088
Nebraska                0.975901
Nevada                  1.155735
New Hampshire           1.213934
New 

In [150]:
def _cluster_labels(state_df):
    """Cluster one state's counties into len(state_df) - 5 groups.

    States with 6 or fewer rows are not clustered and yield 0 instead of labels.
    """
    if len(state_df) > 6:
        features = ct.fit_transform(state_df[cols])
        model = AgglomerativeClustering(n_clusters=len(state_df) - 5)
        return model.fit(features).labels_
    return 0


# Number of distinct cluster labels assigned within Alabama.
labels_by_state = grouped.apply(_cluster_labels)
len(set(labels_by_state['Alabama']))

62

In [140]:
# Preview pop_df2.
# NOTE(review): the saved output shows a `cluster` column, but no cell visible in
# this part of the notebook creates it — likely leftover kernel state; confirm.
pop_df2.head()

Unnamed: 0,state,county,total_pop,fips,white,black,asian,hispanic,area,lon,...,per_no_hs,per_some_hs,per_hs,per_associates,per_bachelors,per_graduate,per_college,per_gop,gop,cluster
0,Alabama,Autauga,55869,1001,41215,11098,646,1671,594.436,-86.641196,...,0.025722,0.087392,0.326078,0.080665,0.158828,0.118065,0.357558,0.754018,True,
1,Alabama,Baldwin,223234,1003,185747,19215,2346,10534,1589.784,-87.723954,...,0.027063,0.070291,0.276068,0.093606,0.207029,0.10643,0.407064,0.798123,True,
2,Alabama,Barbour,24686,1005,11235,11807,116,1117,884.876,-85.389245,...,0.08199,0.187696,0.356903,0.070379,0.077973,0.044186,0.192538,0.528359,True,
3,Alabama,Bibb,22394,1007,16663,4719,46,623,622.582,-87.124963,...,0.057224,0.11071,0.473447,0.057541,0.075856,0.039037,0.172433,0.78227,True,
4,Alabama,Blount,57826,1009,50176,872,163,5582,644.776,-86.569756,...,0.074873,0.123502,0.340399,0.120499,0.081182,0.045247,0.246928,0.913855,True,


In [None]:
# NOTE(review): `group`, `state`, `nclusters`, `p` and `sils` are not defined in this
# cell; this looks like the body of a per-state loop — confirm where they come from.
# Scale this group's feature columns with the column transformer.
X = ct.fit_transform(group[cols])
# Requested cluster count: the stored count for this state plus the offset `p`.
n = nclusters[state] + p
# Only cluster when the count is strictly between 1 and the number of rows.
if (n > 1) and (n < len(group)):
    ac = AgglomerativeClustering(n_clusters=n).fit(X)
    clusters = ac.labels_
    sil = metrics.silhouette_score(X, clusters, metric='euclidean')
else:
    # Fallback: every row gets its own label and no silhouette score is recorded.
    clusters = np.arange(0, len(group))
    sil = np.nan
sils.append(sil)

In [100]:
# Feature columns selected from each group before scaling / clustering.
cols = [
    'lon', 'lat', 'total_pop', 'pop_density',
    'per_white', 'per_black', 'per_asian',
    'median_income', 'per_votes_gop',
    'per_no_hs', 'per_some_hs', 'per_hs',
    'per_associates', 'per_bachelors', 'per_graduate',
]

# Group a copy of pop_df by state, leaving pop_df itself untouched.
test = pop_df.copy()
grouped = test.groupby('state')

class SklearnWrapper:
    """Fit and apply a scikit-learn style transformer separately for each group.

    Intended for use with ``groupby(...).apply(...)``: every call to ``fit`` stores
    one fitted transformer, and successive ``transform`` / ``inverse_transform``
    calls walk through the stored transformers in the same order, wrapping around
    after the last one so that a new pass starts again at the first group.

    Parameters
    ----------
    transformation
        Template object exposing ``fit`` (returning the fitted object) plus
        ``transform`` and/or ``inverse_transform``. It is copied for each group
        and is itself never fitted.
    """

    def __init__(self, transformation):
        self.transformation = transformation
        # One independently fitted transformer per group, in fit order.
        self._group_transforms = []
        # Index of the transformer used by the most recent call; -1 means "none yet".
        self._pointer = -1

    def _call_with_function(self, df: pd.DataFrame, function: str):
        """Call ``function`` on the next group's transformer and re-wrap the result.

        The result is returned as a DataFrame carrying ``df``'s columns and index,
        so the transformer must return one output column per input column.
        """
        # Once the last fitted transformer has been used, a new pass over the
        # groups is starting: wrap around to the first one. (Comparing against
        # len - 1 matters: the pointer never exceeds len - 1, so comparing
        # against len would never reset and the next call would index past the end.)
        if self._pointer >= len(self._group_transforms) - 1:
            self._pointer = -1
        self._pointer += 1
        fitted = self._group_transforms[self._pointer]
        return pd.DataFrame(
            getattr(fitted, function)(df.values),
            columns=df.columns,
            index=df.index,
        )

    def fit(self, df):
        """Fit a fresh copy of the template on ``df`` and store it; returns ``self``."""
        from copy import deepcopy

        # scikit-learn's ``fit`` returns the estimator itself, so fitting the
        # template directly would store the same object for every group and each
        # new fit would overwrite the previous groups' parameters.
        self._group_transforms.append(deepcopy(self.transformation).fit(df.values))
        return self

    def transform(self, df):
        """Transform ``df`` with the next stored transformer."""
        return self._call_with_function(df, "transform")

    def fit_transform(self, df):
        """Fit a new transformer on ``df`` and immediately transform ``df`` with it."""
        self.fit(df)
        return self.transform(df)

    def inverse_transform(self, df):
        """Undo the transformation of ``df`` with the next stored transformer."""
        return self._call_with_function(df, "inverse_transform")

# Future Work: import Puerto Rico census data

To do:
- find detailed demographic data for Puerto Rico
- find a way to incorporate Puerto Rico into the Altair map

In [None]:
# with urlopen('https://www2.census.gov/programs-surveys/popest/tables/2010-2019/municipios/totals/prm-est2019-annres.xlsx') as response:
#     pr_df = pd.read_excel(response, header=3)
# Built as a single chain so no step mutates a slice of another frame in place
# (this avoids `inplace=True` and SettingWithCopyWarning).
pr_df = (
    pd.read_excel('data/prm-est2019-annres.xlsx', header=3)
    .loc[:, ['Unnamed: 0', 2019]]                                 # name column + 2019 estimate
    .rename(columns={'Unnamed: 0': 'county', 2019: 'total_pop'})
    .dropna(subset=['total_pop'])                                 # drop rows with no 2019 value
    .astype({'total_pop': 'int'})
)
pr_df.head()

In [None]:
# Raw string: the pattern contains backslash escapes (\., \w, \s, \,) that are not
# valid Python string escapes; the compiled pattern is unchanged.
municipio_matches = pr_df['county'].str.findall(r"\.([\w\s]+) Municipio\,.+")
# Keep the captured name where the pattern matched; otherwise keep the (empty) match list.
# `.assign` returns a new frame instead of writing into a filtered slice.
pr_df = pr_df.assign(county=[m[0] if len(m) > 0 else m for m in municipio_matches])
# NOTE: re-running this cell drops one more row each time.
pr_df = pr_df.iloc[1:]          # removing the territory as a whole from the table
pr_df.head()

We also need to add `fips` codes for all of the municipios.

### import Puerto Rico `fips`

In [None]:
sess = HTMLSession()
res = sess.get('https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county')
table = res.html.find('table.wikitable > tbody > tr')
# puerto rico is fips 72
pr_fips = []
for tr in table[1:]:                 # skip the first (header) row
    cells = tr.find('td')            # query the row's cells once, not three times
    if len(cells) < 2:
        continue                     # rows without a code and a name cell cannot be used
    fips_code = cells[0].text
    if fips_code[:2] == '72':
        pr_fips.append([cells[1].text, fips_code])
# Name the columns at construction so they exist even when no row matched.
pr_fips_df = pd.DataFrame(pr_fips, columns=['county', 'fips'])
pr_fips_df.head()

In [None]:
# Raw string: \w and \s are regex escapes, not valid Python string escapes.
municipality_matches = pr_fips_df['county'].str.findall(r"([\w\s]+) Municipality")
# Keep the captured name where the pattern matched; otherwise keep the (empty) match list.
pr_fips_df['county'] = [m[0] if len(m) > 0 else m for m in municipality_matches]
pr_fips_df.head()

In [None]:
# Count of county names in the FIPS table that do not appear in pr_df.
len(set(pr_fips_df['county']) - set(pr_df['county']))

In [None]:
# Attach each municipio's fips code by county name (default inner join) and tag the rows.
pr_df = pr_df.merge(pr_fips_df, on='county').assign(state='Puerto Rico')
pr_df.head()

In [None]:
# Append the Puerto Rico rows exactly once (the previous version appended pr_df
# twice, duplicating every municipio). `pd.concat` replaces `DataFrame.append`,
# which was removed in pandas 2.0.
# NOTE: re-running this cell appends the rows again.
pop_df = optimize(pd.concat([pop_df, pr_df], ignore_index=True))
pop_df.tail()

## check county names against NYTimes data (again)

In [None]:
# County names that occur in the NYTimes data but not in dem_df.
nyt_counties = set(nyt_df['county'])
dem_counties = set(dem_df['county'])
county_diffs = list(nyt_counties - dem_counties)
len(county_diffs)

In [None]:
# Display the county names present in nyt_df but absent from dem_df.
county_diffs

The NYTimes dataset omits diacritical marks from its county names. Although it would be easier to replace the accented characters in our data with their unaccented counterparts, we will preserve them in our final dataframe in the interest of cultural accuracy. This will be handled when we merge `pop_df` with `nyt_df` in the other notebook.