# Census and GeoJSON Data EDA

In [61]:
import numpy as np
import pandas as pd

from urllib.request import urlopen
# from requests_html import HTMLSession
import json

import itertools

# import re

from time import time
from datetime import datetime, timedelta

from shapely.geometry import Polygon

In [62]:
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.

    Works on a copy, so `df` itself is never modified. Each column is
    handled according to its current dtype:

    - `object` columns are converted to datetime when the whole column
      parses; otherwise they are converted to `category` when fewer than
      half of the rows hold distinct values, and left untouched if not.
    - `int64` columns are downcast to the smallest integer dtype that fits.
    - `float64` columns are downcast to the smallest float dtype that fits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and rows and tighter dtypes.
    '''
    dft = df.copy()
    n_rows = len(dft)
    for col in dft.columns:
        series = dft[col]
        if series.dtype == 'object':
            # converts to datetime if possible; `errors='ignore'` is deprecated in
            # pandas, so a failed parse is caught explicitly and the column kept
            try:
                dft[col] = pd.to_datetime(series)
                continue
            except (ValueError, TypeError, OverflowError):
                pass
            # if there are less than half as many unique values as there are rows,
            # convert to category (the row-count guard avoids dividing by zero)
            if n_rows > 0 and len(series.unique()) / n_rows < 0.5:
                dft[col] = series.astype('category')
        elif series.dtype == 'int64':
            # downcasts integer columns if possible
            dft[col] = pd.to_numeric(series, downcast='integer')
        elif series.dtype == 'float64':
            # downcasts float columns if possible
            dft[col] = pd.to_numeric(series, downcast='float')
    return dft

# 1. import census data from `census.gov`

Demographic data taken from [census.gov](https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/asrh/).

Since 2020 Census data have not been released yet, we will use 2019 population estimates.

Looking at the [data dictionary](https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/cc-est2019-alldata.pdf), we will only save the data from `YEAR == 12`.

In [71]:
# Stream the county-level estimates CSV from census.gov into a DataFrame.
with urlopen('https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/asrh/cc-est2019-alldata.csv') as response:
    dem_df = pd.read_csv(
        response,
        encoding='latin-1',        # to avoid unicode error
        # keep the FIPS components as text; they are concatenated into `fips` later
        dtype={'STATE':'str',
               'COUNTY':'str'}
    )
# NOTE(review): SUMLEV is presumably the geographic summary-level code (050 = county),
# i.e. the same for every row of this county file -- confirm against the data dictionary
dem_df = dem_df.drop(columns='SUMLEV')
dem_df = dem_df.loc[(dem_df['YEAR'] == 12)]    # population estimate for 2019
dem_df.head()

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,YEAR,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,WA_MALE,...,HWAC_MALE,HWAC_FEMALE,HBAC_MALE,HBAC_FEMALE,HIAC_MALE,HIAC_FEMALE,HAAC_MALE,HAAC_FEMALE,HNAC_MALE,HNAC_FEMALE
209,1,1,Alabama,Autauga County,12,0,55869,27092,28777,20878,...,778,687,89,93,40,27,15,19,16,11
210,1,1,Alabama,Autauga County,12,1,3277,1713,1564,1249,...,76,53,10,6,6,5,3,4,3,3
211,1,1,Alabama,Autauga County,12,2,3465,1787,1678,1287,...,83,59,2,10,8,2,2,0,1,1
212,1,1,Alabama,Autauga County,12,3,3851,1977,1874,1441,...,84,67,11,12,2,2,1,2,2,1
213,1,1,Alabama,Autauga County,12,4,3659,1854,1805,1341,...,55,68,7,6,4,5,0,4,3,0


In [72]:
dem_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59698 entries, 209 to 716375
Data columns (total 79 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   STATE         59698 non-null  object
 1   COUNTY        59698 non-null  object
 2   STNAME        59698 non-null  object
 3   CTYNAME       59698 non-null  object
 4   YEAR          59698 non-null  int64 
 5   AGEGRP        59698 non-null  int64 
 6   TOT_POP       59698 non-null  int64 
 7   TOT_MALE      59698 non-null  int64 
 8   TOT_FEMALE    59698 non-null  int64 
 9   WA_MALE       59698 non-null  int64 
 10  WA_FEMALE     59698 non-null  int64 
 11  BA_MALE       59698 non-null  int64 
 12  BA_FEMALE     59698 non-null  int64 
 13  IA_MALE       59698 non-null  int64 
 14  IA_FEMALE     59698 non-null  int64 
 15  AA_MALE       59698 non-null  int64 
 16  AA_FEMALE     59698 non-null  int64 
 17  NA_MALE       59698 non-null  int64 
 18  NA_FEMALE     59698 non-null  int64 
 19  T

Notice that county names provided by the US census contain descriptive terms, such as 'County', whereas the NYTimes data does not.

In [73]:
# rename columns to better-match nytimes data (and personal preference)
# rename columns to better-match nytimes data (and personal preference)
# every step returns a new frame, so nothing is written into the filtered
# view of the raw census data (no `inplace`, no SettingWithCopyWarning)
dem_df = (
    dem_df
    .rename(
        columns={
            'STATE':'statefips',
            'COUNTY':'countyfips',
            'STNAME':'state',
            'CTYNAME':'county',
        }
    )
    .rename(columns=str.lower)
    # create fips column by concatenating the state and county code strings
    .assign(fips=lambda d: d['statefips'] + d['countyfips'])
    .drop(columns=['statefips', 'countyfips'])
)

# remove descriptive terms from county names
# `regex=False` pins literal matching; the default for `regex` differs between pandas versions
county_terms = ['County', 'Parish', 'Municipality']
for term in county_terms:
    dem_df['county'] = dem_df['county'].str.replace(' ' + term, '', regex=False)

dem_df.head()

Unnamed: 0,state,county,year,agegrp,tot_pop,tot_male,tot_female,wa_male,wa_female,ba_male,...,hwac_female,hbac_male,hbac_female,hiac_male,hiac_female,haac_male,haac_female,hnac_male,hnac_female,fips
209,Alabama,Autauga,12,0,55869,27092,28777,20878,21729,5237,...,687,89,93,40,27,15,19,16,11,1001
210,Alabama,Autauga,12,1,3277,1713,1564,1249,1117,339,...,53,10,6,6,5,3,4,3,3,1001
211,Alabama,Autauga,12,2,3465,1787,1678,1287,1191,388,...,59,2,10,8,2,2,0,1,1,1001
212,Alabama,Autauga,12,3,3851,1977,1874,1441,1350,442,...,67,11,12,2,2,1,2,2,1,1001
213,Alabama,Autauga,12,4,3659,1854,1805,1341,1334,434,...,68,7,6,4,5,0,4,3,0,1001


## check county names against NYTimes data

We eventually need to merge `nyt_df` and `pop_df`, so let's see how they match with each other:

In [74]:
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv') as response:
    nyt_df = optimize(pd.read_csv(
        response,
        dtype={'fips':'str'}
    ))
nyt_df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0


In [75]:
county_diffs = list(set(nyt_df['fips']) - set(dem_df['fips']))
len(county_diffs)

81

In [10]:
sorted([str(f) for f in county_diffs])

['69110',
 '69120',
 '72001',
 '72003',
 '72005',
 '72007',
 '72009',
 '72011',
 '72013',
 '72015',
 '72017',
 '72019',
 '72021',
 '72023',
 '72025',
 '72027',
 '72029',
 '72031',
 '72033',
 '72035',
 '72037',
 '72039',
 '72041',
 '72043',
 '72045',
 '72047',
 '72049',
 '72051',
 '72053',
 '72054',
 '72055',
 '72057',
 '72059',
 '72061',
 '72063',
 '72065',
 '72067',
 '72069',
 '72071',
 '72073',
 '72075',
 '72077',
 '72079',
 '72081',
 '72083',
 '72085',
 '72087',
 '72089',
 '72091',
 '72093',
 '72095',
 '72097',
 '72099',
 '72101',
 '72103',
 '72105',
 '72107',
 '72109',
 '72111',
 '72113',
 '72115',
 '72117',
 '72119',
 '72121',
 '72123',
 '72125',
 '72127',
 '72129',
 '72131',
 '72133',
 '72135',
 '72137',
 '72139',
 '72141',
 '72143',
 '72145',
 '72147',
 '72149',
 '72151',
 '72153',
 'nan']

As expected, the census county data is missing all municipios from [Puerto Rico](https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html) (FIPS codes beginning with `72`) as well as two areas from the Northern Mariana Islands (FIPS codes beginning with `69`), so we need to append that data to `pop_df`. The `'nan'` entry comes from NYTimes rows that have no FIPS code.

## truncated `pop_df` with just `tot_pop`

Since the NYTimes dataset treats `New York City`, `Kansas City`, and `Joplin` as their own entities, we need to add them to `pop_df`.

`New York City` is the combination of these five counties, which are coterminous with the five boroughs:

- Bronx
- Kings
- New York
- Queens
- Richmond

Additional information for `Kansas City` and `Joplin` was taken from [census.gov quickfacts](https://www.census.gov/quickfacts).

We'll use `'nyc'`, `'kc'`, and `'jm'` as our `fips` for these three cities.

In [76]:
boroughs = ['Bronx', 'Kings', 'New York', 'Queens', 'Richmond']

# rows of the five counties that make up New York City
is_borough = (dem_df['state'] == 'New York') & dem_df['county'].isin(boroughs)
borough_df = dem_df.loc[is_borough]

# `year` and `agegrp` identify a row rather than count people, so they are used
# as grouping keys instead of being summed and divided back out afterwards
count_cols = [
    col for col in borough_df.select_dtypes(include='number').columns
    if col not in ('year', 'agegrp')
]

# one summed row per (year, agegrp), matched by key rather than by row position
nyc_dem_df = (
    borough_df
    .groupby(['year', 'agegrp'], as_index=False)[count_cols]
    .sum()
    .assign(state='New York', county='New York City', fips='nyc')
)
nyc_dem_df.head()

Unnamed: 0,year,agegrp,tot_pop,tot_male,tot_female,wa_male,wa_female,ba_male,ba_female,ia_male,...,hbac_female,hiac_male,hiac_female,haac_male,haac_female,hnac_male,hnac_female,state,county,fips
417449,12,0,8336817,3978439,4358378,2145238,2247804,1046937,1247761,57897,...,294492,65574,67225,21187,22500,8967,9468,New York,New York City,nyc
417450,12,1,523718,268169,255549,140750,134037,71305,69075,2692,...,20889,4011,3809,2387,2370,768,750,New York,New York City,nyc
417451,12,2,484313,247453,236860,126234,120716,69981,67787,3195,...,21664,4352,4296,1869,1774,592,527,New York,New York City,nyc
417452,12,3,443786,226531,217255,113307,108563,69007,67426,4608,...,19396,5006,4704,1719,1564,628,605,New York,New York City,nyc
417453,12,4,439764,221600,218164,110652,107743,68131,68286,4450,...,18725,4747,4686,1457,1472,662,593,New York,New York City,nyc


In [77]:
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent
dem_df = optimize(pd.concat([dem_df, nyc_dem_df], ignore_index=True))
dem_df[dem_df['fips'] == 'nyc'].head()

Unnamed: 0,state,county,year,agegrp,tot_pop,tot_male,tot_female,wa_male,wa_female,ba_male,...,hwac_female,hbac_male,hbac_female,hiac_male,hiac_female,haac_male,haac_female,hnac_male,hnac_female,fips
59698,New York,New York City,12,0,8336817,3978439,4358378,2145238,2247804,1046937,...,922001,257362,294492,65574,67225,21187,22500,8967,9468,nyc
59699,New York,New York City,12,1,523718,268169,255549,140750,134037,71305,...,64749,21801,20889,4011,3809,2387,2370,768,750,nyc
59700,New York,New York City,12,2,484313,247453,236860,126234,120716,69981,...,63336,22523,21664,4352,4296,1869,1774,592,527,nyc
59701,New York,New York City,12,3,443786,226531,217255,113307,108563,69007,...,55336,20348,19396,5006,4704,1719,1564,628,605,nyc
59702,New York,New York City,12,4,439764,221600,218164,110652,107743,68131,...,55518,19338,18725,4747,4686,1457,1472,662,593,nyc


In [78]:
dem_df['per_white'] = (dem_df['wa_male']+dem_df['wa_female']) / dem_df['tot_pop']

In [79]:
pop_df = dem_df[dem_df['agegrp'] == 0][['state', 'county', 'tot_pop', 'fips', 'per_white']]
pop_df[pop_df['fips'] == 'nyc']

Unnamed: 0,state,county,tot_pop,fips,per_white
59698,New York,New York City,8336817,nyc,0.526945


In [81]:
# Hand-entered population rows for the two Missouri cities that get their own `fips` labels.
# NOTE(review): the figures are hard-coded -- presumably from census.gov QuickFacts; confirm
# the source and vintage. No `per_white` value is supplied for these rows.
pop_df2 = pd.DataFrame(
    [['Missouri',
      'Kansas City',
      495_327 + 152_960,    # presumably Kansas City, MO + Kansas City, KS -- TODO confirm
      'kc'],
     ['Missouri',
      'Joplin',
      50_925,
      'jm']]
    , columns=['state', 'county', 'tot_pop', 'fips'])
pop_df2

Unnamed: 0,state,county,tot_pop,fips
0,Missouri,Kansas City,648287,kc
1,Missouri,Joplin,50925,jm


In [82]:
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent.
# Columns missing from `pop_df2` (e.g. `per_white`) are filled with NaN for its rows.
pop_df = optimize(pd.concat([pop_df, pop_df2], ignore_index=True))
pop_df.tail(5)

Unnamed: 0,state,county,tot_pop,fips,per_white
3140,Wyoming,Washakie,7805,56043,0.945163
3141,Wyoming,Weston,6927,56045,0.931716
3142,New York,New York City,8336817,nyc,0.526945
3143,Missouri,Kansas City,648287,kc,
3144,Missouri,Joplin,50925,jm,


## save results to csv

In [83]:
dem_df.to_csv('data/dem_df.csv', index=False)
pop_df.to_csv('data/pop_df.csv', index=False)

# 2. import geojson for boundaries and census areas

In [23]:
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    county_json = json.load(response)

In [24]:
county_json['features'][0]

{'type': 'Feature',
 'properties': {'GEO_ID': '0500000US01001',
  'STATE': '01',
  'COUNTY': '001',
  'NAME': 'Autauga',
  'LSAD': 'County',
  'CENSUSAREA': 594.436},
 'geometry': {'type': 'Polygon',
  'coordinates': [[[-86.496774, 32.344437],
    [-86.717897, 32.402814],
    [-86.814912, 32.340803],
    [-86.890581, 32.502974],
    [-86.917595, 32.664169],
    [-86.71339, 32.661732],
    [-86.714219, 32.705694],
    [-86.413116, 32.707386],
    [-86.411172, 32.409937],
    [-86.496774, 32.344437]]]},
 'id': '01001'}

In [25]:
fips_to_add_to_json = list(set(nyt_df['fips']) - set([f['id'] for f in county_json['features']]))
fips_to_add_to_json

[nan, '69120', '46102', '69110', '02158']

The `plotly` county GeoJSON dataset is missing Kusilvak Census Area (`'02158'`) and Oglala Lakota County (`'46102'`), in addition to the three cities included in the NYTimes data (New York City, Kansas City, Joplin). GeoJSON data for these five areas were compiled from [Nominatim](https://nominatim.openstreetmap.org/) and [polygons](http://polygons.openstreetmap.fr/). We will ignore the entries for the Northern Mariana Islands for now.

- Search for the area at [Nominatim](https://nominatim.openstreetmap.org/).
- Select `details` from the relevant entry.
- Copy the numeric `code` under `OSM`, ignoring "relation". Eg. for New York City, copy `175905`.
- Search for the `code` at [polygons](http://polygons.openstreetmap.fr/).
- For our purposes, GeoJSONs were selected according to the following criteria: (1) sparsity of vertices (`NPoints`) and (2) accuracy of shape.

In [26]:
# Download the hand-compiled boundary files (JSON stored as .txt) for the areas
# that are added to the county GeoJSON.

# new york city, ny
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/nyc.txt') as response:
    nyc_json = json.load(response)

# kansas city, mo/ks
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/kcm.txt') as response:
    kcm_json = json.load(response)
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/kck.txt') as response:
    kck_json = json.load(response)
# the two Kansas City files are combined by concatenating their coordinate lists
kc_json = dict(
    coordinates = kcm_json['coordinates'] + kck_json['coordinates']
)

# joplin, mo
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/jm.txt') as response:
    jm_json = json.load(response)

# oglala lakota county, sd
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/olsd.txt') as response:
    olsd_json = json.load(response)

# kusilvak census area, ak
with urlopen('https://raw.githubusercontent.com/jydiw/nyt-covid-19-data/master/data/kca.txt') as response:
    kca_json = json.load(response)

In [27]:
# https://stackoverflow.com/questions/41271146/
def clean_coordinates(c):
    '''
    Flattens one level of nesting inside each element of `c`.

    Every element of `c` is an iterable of iterables; its inner iterables
    are concatenated, in order, into a single list. Returns one such list
    per element of `c`.
    '''
    flattened = []
    for group in c:
        flattened.append(list(itertools.chain.from_iterable(group)))
    return flattened

In [29]:
# Properties and geometry for the features appended to `county_json`, keyed by the `id`
# each feature will carry.
# NOTE(review): the `area` values are hard-coded; presumably in the same units as the
# existing `CENSUSAREA` property -- confirm source and units.
add_to_json_dict = {
    '02158':{'area':17_081.43,
             'name':'Kusilvak Census Area',
             'coordinates':clean_coordinates(kca_json['coordinates'])}, 
    '46102':{'area':2_093.90,
             'name':'Oglala Lakota',
             'coordinates':clean_coordinates(olsd_json['coordinates'])},
    # the Joplin file nests its coordinates under `geometries[0]`, unlike the others
    'jm':{'area':35.56,
          'name':'Joplin',
          'coordinates':clean_coordinates(jm_json['geometries'][0]['coordinates'])},
    # presumably the areas of the two Kansas City pieces added together -- TODO confirm
    'kc':{'area':124.81+314.95,
          'name':'Kansas City',
          'coordinates':clean_coordinates(kc_json['coordinates'])},
    'nyc':{'area':302.64,
           'name':'New York City',
           'coordinates':clean_coordinates(nyc_json['coordinates'])}
}

In [30]:
# Append one GeoJSON Feature per entry of `add_to_json_dict`. Ids that are already
# present are skipped, so re-running this cell does not duplicate features.
existing_ids = {feature.get('id') for feature in county_json['features']}
for fips, extra in add_to_json_dict.items():
    if fips in existing_ids:
        continue
    county_json['features'].append(
        {
            'geometry': {'coordinates': extra['coordinates'],
                         'type': 'Polygon'},
            'id': fips,
            'properties': {'NAME': extra['name'],
                           'CENSUSAREA': extra['area']},
            'type': 'Feature'
        }
    )

In [31]:
with open('data/county_json.json', 'w') as f:
    json.dump(county_json, f)

## add centroid latitude and longitude coordinates and county area to `pop_df`

In [32]:
def _iter_rings(coordinates):
    '''
    Yields every ring (a sequence of [lon, lat] pairs) nested anywhere inside
    a GeoJSON `coordinates` value, whatever its nesting depth.
    '''
    for part in coordinates:
        if part and isinstance(part[0], (int, float)):
            # `part` is a coordinate pair, so `coordinates` itself is a ring
            yield coordinates
            return
        yield from _iter_rings(part)


def centroid(i, j=county_json):
    '''
    Returns the centroid of the largest ring of the feature whose `id` is `i`.

    Parameters
    ----------
    i : str
        Feature id to look up.
    j : dict
        GeoJSON-like dict with a `features` list; defaults to `county_json`.

    Returns
    -------
    tuple of (lon, lat), or None when no feature has id `i` or the matching
    feature has no ring with at least three points.
    '''
    for d in j['features']:
        if d['id'] == i:
            # Walk the nested lists directly instead of going through `np.array`,
            # which raises on ragged input in recent NumPy versions.
            polygons = [
                Polygon(ring)
                for ring in _iter_rings(d['geometry']['coordinates'])
                if len(ring) >= 3
            ]
            if not polygons:
                return None
            # a feature may hold several rings; use the one covering the most area
            largest = max(polygons, key=lambda p: p.area)
            lon, lat = largest.centroid.coords[0]
            return lon, lat
    return None

In [33]:
def county_area(i, j=county_json):
    '''
    Looks up the `CENSUSAREA` property of the feature whose `id` equals `i`.

    Returns the value from the first matching feature in `j['features']`,
    or None when no feature matches.
    '''
    matching_areas = (
        feature['properties']['CENSUSAREA']
        for feature in j['features']
        if feature['id'] == i
    )
    return next(matching_areas, None)

In [84]:
# Time the per-row lookups; each `county_area` / `centroid` call scans the feature list.
tick = time()
pop_df['area'] = pop_df['fips'].apply(county_area)
# `centroid` yields a (lon, lat) tuple per row; zip(*...) transposes them into two columns.
# It returns None for a fips with no matching feature, which would make this unpacking fail.
pop_df['lon'], pop_df['lat'] = zip(*pop_df['fips'].apply(centroid).to_list())
pop_df = optimize(pop_df)
tock = time()
print(tock - tick)

0.7001566886901855


In [85]:
pop_df.head()

Unnamed: 0,state,county,tot_pop,fips,per_white,area,lon,lat
0,Alabama,Autauga,55869,1001,0.762623,594.435974,-86.641197,32.536152
1,Alabama,Baldwin,223234,1003,0.87441,1589.784058,-87.723953,30.725863
2,Alabama,Barbour,24686,1005,0.491534,884.875977,-85.389244,31.867889
3,Alabama,Bibb,22394,1007,0.767661,622.58197,-87.124962,32.996456
4,Alabama,Blount,57826,1009,0.958254,644.776001,-86.569756,33.985249


## engineer population density column

In [86]:
pop_df['pop_per_area'] = pop_df['tot_pop'] / pop_df['area']
pop_df.head()

Unnamed: 0,state,county,tot_pop,fips,per_white,area,lon,lat,pop_per_area
0,Alabama,Autauga,55869,1001,0.762623,594.435974,-86.641197,32.536152,93.986573
1,Alabama,Baldwin,223234,1003,0.87441,1589.784058,-87.723953,30.725863,140.417813
2,Alabama,Barbour,24686,1005,0.491534,884.875977,-85.389244,31.867889,27.897695
3,Alabama,Bibb,22394,1007,0.767661,622.58197,-87.124962,32.996456,35.969561
4,Alabama,Blount,57826,1009,0.958254,644.776001,-86.569756,33.985249,89.683859


## add elections data

Mask compliance has been very political, so it would be interesting to see how political differences vary by county. Data taken from [github.com/tonmcg](https://github.com/tonmcg). Alaska data taken from [RRH Elections](https://rrhelections.com/index.php/2018/02/02/alaska-results-by-county-equivalent-1960-2016/).

In [43]:
with urlopen('https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-16/master/2016_US_County_Level_Presidential_Results.csv') as response:
    elect_df = pd.read_csv(
        response,
        encoding='latin-1',        # to avoid unicode error
        dtype={
            'votes_dem':'int',
            'votes_gop':'int',
            'total_votes':'int',
            'combined_fips':'str'},
        index_col=0
    )
elect_df.head()

Unnamed: 0,votes_dem,votes_gop,total_votes,per_dem,per_gop,diff,per_point_diff,state_abbr,county_name,combined_fips
0,93003,130413,246588,0.377159,0.52887,37410,15.17%,AK,Alaska,2013
1,93003,130413,246588,0.377159,0.52887,37410,15.17%,AK,Alaska,2016
2,93003,130413,246588,0.377159,0.52887,37410,15.17%,AK,Alaska,2020
3,93003,130413,246588,0.377159,0.52887,37410,15.17%,AK,Alaska,2050
4,93003,130413,246588,0.377159,0.52887,37410,15.17%,AK,Alaska,2060


In [44]:
# `rename` returns a new frame; no in-place mutation of the frame displayed above
elect_df = elect_df.rename(
    columns={
        'county_name':'county',
        'combined_fips':'fips',
    }
)

# left-pad the fips strings with zeros to five characters
elect_df['fips'] = elect_df['fips'].str.zfill(5)

# remove descriptive terms from county names
# `regex=False` pins literal matching; the default for `regex` differs between pandas versions
for term in county_terms:
    elect_df['county'] = elect_df['county'].str.replace(' ' + term, '', regex=False)

In [45]:
elect_df = elect_df[['state_abbr', 'county', 'fips', 'votes_dem', 'votes_gop', 'per_gop', 'per_dem', 'total_votes']]
elect_df = elect_df.sort_values(by='fips')
elect_df.head()

Unnamed: 0,state_abbr,county,fips,votes_dem,votes_gop,per_gop,per_dem,total_votes
29,AL,Autauga,1001,5908,18110,0.734358,0.239569,24661
30,AL,Baldwin,1003,18409,72780,0.773515,0.195653,94090
31,AL,Barbour,1005,4848,5431,0.522714,0.466603,10390
32,AL,Bibb,1007,1874,6733,0.769662,0.21422,8748
33,AL,Blount,1009,2150,22808,0.898519,0.084699,25384


### add new york city election data

In [46]:
# Sum the numeric columns of the five borough rows into a single-row frame.
# Selecting with `isin` and summing avoids accumulating in place into a slice of
# `elect_df` and does not depend on each borough matching exactly one row.
in_nyc = (elect_df['state_abbr'] == 'NY') & elect_df['county'].isin(boroughs)
nyc_elect_df = (
    elect_df.loc[in_nyc]
    .select_dtypes(include='number')
    .sum()
    .to_frame()
    .T
)
# the summed `per_gop` / `per_dem` values are not meaningful shares; they are
# recomputed from the vote totals further down
nyc_elect_df

Unnamed: 0,votes_dem,votes_gop,per_gop,per_dem,total_votes
1982,1969920.0,461174.0,1.167451,3.715365,2490750.0


In [47]:
nyc_elect_df[['votes_dem', 'votes_gop', 'total_votes']] = nyc_elect_df[['votes_dem', 'votes_gop', 'total_votes']].astype('int')

In [48]:
nyc_elect_df['per_dem'] = nyc_elect_df['votes_dem'] / nyc_elect_df['total_votes']
nyc_elect_df['per_gop'] = nyc_elect_df['votes_gop'] / nyc_elect_df['total_votes']

In [49]:
nyc_elect_df['state_abbr'] = 'NY'
nyc_elect_df['county'] = 'New York City'
nyc_elect_df['fips'] = 'nyc'
nyc_elect_df.head()

Unnamed: 0,votes_dem,votes_gop,per_gop,per_dem,total_votes,state_abbr,county,fips
1982,1969920,461174,0.185155,0.790894,2490750,NY,New York City,nyc


In [50]:
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent
elect_df = pd.concat([elect_df, nyc_elect_df], ignore_index=True)

### add alaska elections data

In [51]:
# with urlopen('https://www.elections.alaska.gov/results/16GENR/data/resultsbyprct.txt') as response:
#     ak_elect_df = pd.read_csv(
#         response,
#         encoding='latin-1',
#         names=['district-name', 'category', 'measure', 'party', 'total', 'count', 'x']
#     )
# ak_elect_df.head()

# Alaska results are read from a local workbook instead of the commented-out download above.
ak_elect_df = pd.read_excel('data/2016 AK Gen Official.xlsx', sheet_name='By CE')
# keep the first 29 rows and 12 columns
# NOTE(review): presumably the per-area rows and the candidate/total columns -- confirm against the sheet layout
ak_elect_df = ak_elect_df.iloc[0:29, 0:12]
ak_elect_df.head()

Unnamed: 0,ED/Muni,Municipality Code,Registered Voters,Times Counted,"Castle, Darrell L.","Clinton, Hillary","De La Fuente, Roque","Johnson, Gary","Stein, Jill","Trump, Donald J.",Write-in 60,ED Total
0,Ketchikan Gateway,Ketchikan,10512,4283,48,1295,13,339,84,2354,104,4237
1,Prince of Wales-Hyder,Prince of Wales-Hyder,4630,1831,67,666,29,93,65,831,59,1810
2,Sitka,Sitka,7218,2787,38,1261,18,145,78,1146,73,2759
3,Petersburg,Petersburg,2741,1078,12,334,7,64,37,577,32,1063
4,Wrangell,Wrangell,1731,764,7,177,3,35,13,512,13,760


In [52]:
# Vote shares per area. `assign` builds a new frame, so nothing is written into the
# `iloc` slice taken from the workbook. Note that the candidate column names end with
# a trailing space.
ak_elect_df = (
    ak_elect_df
    .assign(
        per_gop=lambda d: d['Trump, Donald J. '] / d['ED Total'],
        per_dem=lambda d: d['Clinton, Hillary '] / d['ED Total'],
    )
    [['ED/Muni', 'per_gop', 'per_dem', 'ED Total']]
    .sort_values(by='ED/Muni')    # sorted once; the original sorted twice
)
ak_elect_df.head()

Unnamed: 0,ED/Muni,per_gop,per_dem,ED Total
22,Aleutians East,0.536585,0.327913,369
24,Aleutians West,0.307329,0.582742,846
19,Anchorage,0.489018,0.393374,81678
12,Bethel,0.205695,0.553776,3933
25,Bristol Bay,0.56962,0.313291,316


In [53]:
print(len(ak_elect_df))
print(len(elect_df[elect_df['state_abbr'] == 'AK']))

29
29


In [54]:
# Overwrite the Alaska rows of `elect_df` with the per-area results from the workbook.
# NOTE(review): `.values` assigns purely by position, so this relies on the Alaska rows of
# `elect_df` (sorted by fips) and the rows of `ak_elect_df` (sorted by 'ED/Muni') being in the
# same order -- only the row counts were checked; confirm the pairing row by row.
elect_df.loc[elect_df['state_abbr'] == 'AK', ['per_gop', 'per_dem', 'total_votes']] = ak_elect_df[['per_gop', 'per_dem', 'ED Total']].values

In [55]:
elect_df = elect_df.drop(columns=['votes_dem', 'votes_gop'])
elect_df[elect_df['fips'] == 'nyc']

Unnamed: 0,state_abbr,county,fips,per_gop,per_dem,total_votes
3141,NY,New York City,nyc,0.185155,0.790894,2490750


In [87]:
pop_df = pop_df.merge(elect_df[['fips', 'per_gop', 'per_dem', 'total_votes']], on='fips', how='left')

In [88]:
pop_df['per_votes'] = pop_df['total_votes'] / pop_df['tot_pop']
pop_df.head()

Unnamed: 0,state,county,tot_pop,fips,per_white,area,lon,lat,pop_per_area,per_gop,per_dem,total_votes,per_votes
0,Alabama,Autauga,55869,1001,0.762623,594.435974,-86.641197,32.536152,93.986573,0.734358,0.239569,24661.0,0.441408
1,Alabama,Baldwin,223234,1003,0.87441,1589.784058,-87.723953,30.725863,140.417813,0.773515,0.195653,94090.0,0.421486
2,Alabama,Barbour,24686,1005,0.491534,884.875977,-85.389244,31.867889,27.897695,0.522714,0.466603,10390.0,0.420886
3,Alabama,Bibb,22394,1007,0.767661,622.58197,-87.124962,32.996456,35.969561,0.769662,0.21422,8748.0,0.39064
4,Alabama,Blount,57826,1009,0.958254,644.776001,-86.569756,33.985249,89.683859,0.898519,0.084699,25384.0,0.438972


In [89]:
pop_df.to_csv('data/pop_df.csv', index=False)

## Future Work: import Puerto Rico census data

To do:
- find detailed demographic data for Puerto Rico
- find a way to incorporate Puerto Rico into the Altair map

In [None]:
# with urlopen('https://www2.census.gov/programs-surveys/popest/tables/2010-2019/municipios/totals/prm-est2019-annres.xlsx') as response:
#     pr_df = pd.read_excel(response, header=3)

# Built as one chain of non-mutating steps, so no `inplace` rename or column
# assignment is ever applied to a slice of the workbook frame.
pr_df = (
    pd.read_excel('data/prm-est2019-annres.xlsx', header=3)
    [['Unnamed: 0', 2019]]
    .rename(
        columns={
            'Unnamed: 0':'county',
            2019:'tot_pop'
        }
    )
    .dropna(subset=['tot_pop'])       # drop rows without a 2019 value
    .astype({'tot_pop': 'int'})
)
pr_df.head()

In [None]:
# Raw string: `\.`, `\w`, `\s` and `\,` are regex escapes, not valid Python string escapes,
# and a non-raw literal triggers an invalid-escape warning on newer Python versions.
# Rows whose text does not match the pattern keep the empty list returned by `findall`.
pr_df['county'] = [s[0] if len(s) > 0 else s for s in pr_df['county'].str.findall(r"\.([\w\s]+) Municipio\,.+")]
pr_df = pr_df.iloc[1:]          # removing the territory as a whole from the table
pr_df.head()

We also need to add `fips` codes for all of the municipios.

### import Puerto Rico `fips`

In [None]:
# NOTE(review): `HTMLSession` comes from `requests_html`, whose import is commented out in
# the imports cell at the top of this notebook -- as written this cell raises NameError on a
# fresh kernel. Re-enable that import (or swap in another scraper) before running.
sess = HTMLSession()
res = sess.get('https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county')
# every row of every `wikitable` on the page
table = res.html.find('table.wikitable > tbody > tr')
# puerto rico is fips 72
# keeps [second cell, first cell] of each row whose first cell starts with '72'
pr_fips = [[tr.find('td')[1].text, tr.find('td')[0].text] for tr in table[1:] if tr.find('td')[0].text[:2] == '72']
pr_fips_df = pd.DataFrame(pr_fips)
pr_fips_df.rename(
    columns={
        0:'county',
        1:'fips'
    }, inplace=True
)
pr_fips_df.head()

In [None]:
# Raw string: `\w` and `\s` are regex escapes, not valid Python string escapes.
# Rows whose text does not match the pattern keep the empty list returned by `findall`.
pr_fips_df['county'] = [s[0] if len(s) > 0 else s for s in pr_fips_df['county'].str.findall(r"([\w\s]+) Municipality")]
pr_fips_df.head()

In [None]:
len(list(set(pr_fips_df['county']) - set(pr_df['county'])))

In [None]:
pr_df = pr_df.merge(pr_fips_df, on='county')
pr_df['state'] = 'Puerto Rico'
pr_df.head()

In [None]:
# Add the Puerto Rico rows exactly once (the original chained two appends of `pr_df`,
# duplicating every municipio). `DataFrame.append` was also removed in pandas 2.0.
pop_df = optimize(pd.concat([pop_df, pr_df], ignore_index=True))
pop_df.tail()

## check county names against NYTimes data (again)

In [None]:
# Puerto Rico was appended to `pop_df`, not `dem_df`, so the re-check compares against
# `pop_df`; against `dem_df` every municipio would be reported as missing.
county_diffs = list(set(nyt_df['county']) - set(pop_df['county']))
len(county_diffs)

In [None]:
county_diffs

The NYTimes dataset omits diacritical marks from its county names. While it would be easier to replace diacritical marks with their "standard" character counterparts, we will preserve them in our final dataframe in the interest of cultural accuracy. This will be handled when we merge `pop_df` with `nyt_df` in the other notebook.