# Census and GeoJSON Data EDA

The goal of this notebook is to obtain and organize the following county-level data:

- nominal data: state, county, fips
- census data: 
    - total population
    - ethnic population(s)
    - voting statistics
    - median income
    - educational attainment
- geographic data (from GeoJSON): 
    - census area
    - latitude/longitude

The statistics gathered in this notebook will only need to be updated once the 2020 Census information is released to the public.

In [1]:
# standard EDA
import numpy as np
import pandas as pd

# processing geodata
import geopandas as gp
import pickle                           # saving to pickle instead of csv
from scipy import sparse
from shapely.geometry import asShape, Polygon

# opening external coordinates
import json

# opening urls
from urllib.request import urlopen

pd.options.display.max_rows = 150
pd.options.display.max_columns = 150

# 1. import census data from `census.gov`

2019 population estimates can be collected from [census.gov](https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html). For the most current estimates, we will only save data from `YEAR == 12` and `AGEGRP == 0` ([data dictionary](https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/cc-est2019-alldata.pdf)).

In [2]:
raw_eth_cols = ['TOT', 'NHWA', 'NHBA', 'NHIA', 'NHAA', 'NHNA', 'NHTOM', 'H']
sex_cols = ['_MALE', '_FEMALE']
es_cols = [e+s for e in raw_eth_cols for s in sex_cols]

pop_cols = ['STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'YEAR', 'AGEGRP', 'TOT_POP']\
           + es_cols

pop_df = pd.read_csv(
    '../data/external/cc-est2019-alldata.csv',
    encoding='latin-1',        # to avoid unicode error
    usecols=pop_cols,          # it's a big file, only import certain columns
    dtype={'STATE':'str',      # these are FIPS codes
           'COUNTY':'str'},
)

# mask for 2019 estimates (12)
pop_df = pop_df.loc[(pop_df['YEAR'] == 12)]    
pop_df = pop_df.drop(columns=['YEAR'])

# rename columns to better-match nytimes data (and personal preference)
pop_df = pop_df.rename(
    columns={
        'STATE':'state_fips',
        'COUNTY':'county_fips',
        'STNAME':'state',
        'CTYNAME':'county',
    }
)

# not sure if we need this level of granularity but we can keep it for now
eth_cols = ['tot_pop_white', 'tot_pop_black', 'tot_pop_native', 'tot_pop_asian', 
            'tot_pop_pacific', 'tot_pop_twoplus', 'tot_pop_hispanic']
es_cols_2 = [(e+s).lower() for e in eth_cols for s in sex_cols]
pop_df = pop_df.rename(
    columns=dict(zip(es_cols[2:], es_cols_2))
)

pop_df.columns = pop_df.columns.str.lower()

# nytimes fips is 5-digit combo of state and county fips
pop_df['fips'] = pop_df['state_fips'] + pop_df['county_fips']
pop_df = pop_df.drop(columns=['county_fips'])

pop_df = pop_df.reset_index(drop=True)

pop_df.head()

Unnamed: 0,state_fips,state,county,agegrp,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,fips
0,1,Alabama,Autauga County,0,55869,27092,28777,20138,21077,5171,5927,105,138,282,364,20,20,492,464,884,787,1001
1,1,Alabama,Autauga County,1,3277,1713,1564,1180,1072,334,340,3,6,23,19,2,3,85,64,86,60,1001
2,1,Alabama,Autauga County,2,3465,1787,1678,1210,1134,388,359,7,8,16,25,0,1,78,81,88,70,1001
3,1,Alabama,Autauga County,3,3851,1977,1874,1362,1285,435,409,3,9,17,24,0,3,66,65,94,79,1001
4,1,Alabama,Autauga County,4,3659,1854,1805,1291,1272,429,397,4,0,21,13,3,3,43,46,63,74,1001


In [3]:
pop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59698 entries, 0 to 59697
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   state_fips               59698 non-null  object
 1   state                    59698 non-null  object
 2   county                   59698 non-null  object
 3   agegrp                   59698 non-null  int64 
 4   tot_pop                  59698 non-null  int64 
 5   tot_male                 59698 non-null  int64 
 6   tot_female               59698 non-null  int64 
 7   tot_pop_white_male       59698 non-null  int64 
 8   tot_pop_white_female     59698 non-null  int64 
 9   tot_pop_black_male       59698 non-null  int64 
 10  tot_pop_black_female     59698 non-null  int64 
 11  tot_pop_native_male      59698 non-null  int64 
 12  tot_pop_native_female    59698 non-null  int64 
 13  tot_pop_asian_male       59698 non-null  int64 
 14  tot_pop_asian_female     59698 non-nul

In [4]:
# remove descriptive terms from county names
# we'll use this again so it's nice to have a function
def remove_county_terms(s):
    county_terms = ['County', 'Parish', 'Municipality']
    for term in county_terms:
        s = s.str.replace(' ' + term, '')
    return s

pop_df[['county']] = pop_df[['county']].apply(remove_county_terms)

# personally like ordinal columns listed first
pop_cols = pop_df.select_dtypes(exclude='number').columns.tolist()\
           + pop_df.select_dtypes(include='number').columns.tolist()
pop_df = pop_df[pop_cols]
pop_df.head()

Unnamed: 0,state_fips,state,county,fips,agegrp,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female
0,1,Alabama,Autauga,1001,0,55869,27092,28777,20138,21077,5171,5927,105,138,282,364,20,20,492,464,884,787
1,1,Alabama,Autauga,1001,1,3277,1713,1564,1180,1072,334,340,3,6,23,19,2,3,85,64,86,60
2,1,Alabama,Autauga,1001,2,3465,1787,1678,1210,1134,388,359,7,8,16,25,0,1,78,81,88,70
3,1,Alabama,Autauga,1001,3,3851,1977,1874,1362,1285,435,409,3,9,17,24,0,3,66,65,94,79
4,1,Alabama,Autauga,1001,4,3659,1854,1805,1291,1272,429,397,4,0,21,13,3,3,43,46,63,74


In [5]:
# check to see if we have all ethnic groups covered
(pop_df.iloc[:, 8:].sum(axis=1) / pop_df['tot_pop']).describe()

count    59690.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
dtype: float64

In [6]:
# add sex-aggregated columns
eth_cols_3 = [e.lower() for e in eth_cols]

for e in eth_cols_3:
    pop_df.loc[:, e] =\
    pop_df.loc[:, e+'_male'] + pop_df.loc[:, e+'_female']
    
pop_df.columns

Index(['state_fips', 'state', 'county', 'fips', 'agegrp', 'tot_pop',
       'tot_male', 'tot_female', 'tot_pop_white_male', 'tot_pop_white_female',
       'tot_pop_black_male', 'tot_pop_black_female', 'tot_pop_native_male',
       'tot_pop_native_female', 'tot_pop_asian_male', 'tot_pop_asian_female',
       'tot_pop_pacific_male', 'tot_pop_pacific_female',
       'tot_pop_twoplus_male', 'tot_pop_twoplus_female',
       'tot_pop_hispanic_male', 'tot_pop_hispanic_female', 'tot_pop_white',
       'tot_pop_black', 'tot_pop_native', 'tot_pop_asian', 'tot_pop_pacific',
       'tot_pop_twoplus', 'tot_pop_hispanic'],
      dtype='object')

## make rows for New York City, Kansas City, and Joplin

Since the NYTimes dataset treats `New York City`, `Kansas City`, and `Joplin` [as their own entities](https://github.com/nytimes/covid-19-data#geographic-exceptions), we need to add them to our population dataframe.

### New York City

`New York City` is the combination of these five counties, [which are coterminous with the five boroughs](https://en.wikipedia.org/wiki/New_York_City#Boroughs).

We will arbitrarily assign the `fips` as `36NYC`.

In [7]:
boroughs = ['Bronx', 'Kings', 'New York', 'Queens', 'Richmond']
nyc_fips = ['36005', '36047', '36061', '36081', '36085']

def combine_counties(source_df, method='sum', using='fips', age=False, 
                     fips=nyc_fips, counties=boroughs, state='New York', 
                     state_abbr='NY', county='New York City', state_fips='36', 
                     end_fips='36NYC'):
    
    # only combine numeric columns
    cols = source_df.select_dtypes(include='number').columns.tolist()
        
    # setting index makes masking data easier
    to_index = [using]
    # locate exact slice described by fips and cols
    if using=='fips':
        df = source_df.set_index(to_index).loc[fips, cols]
    elif using=='county':
        df = source_df.loc[source_df['state']==state].set_index(to_index)\
                      .loc[counties, cols]
    if age:
        df = df.groupby(by='agegrp')
    
    if method=='sum':
        if age:
            temp_df = df.sum().reset_index()
        else:
            temp_df = pd.DataFrame(
                [np.nansum(df, axis=0)], columns=cols
            )
#         temp_df = pd.DataFrame([values], columns=cols)
    elif method=='mean':
        if using=='fips':
            weights_df = pop_df.set_index(to_index).loc[fips, 'tot_pop']
        elif using=='county':
            weights_df = pop_df.loc[pop_df['state']=='New York']\
                           .set_index(to_index).loc[counties, 'tot_pop']
#         return weights_df
        masked_values = np.ma.masked_array(df.values,np.isnan(df.values))
        temp_df = pd.DataFrame(
            [np.sum(
                np.multiply(
                    masked_values, 
                    weights_df.values.reshape((-1, 1))
                ), 
                axis=0
             ) / weights_df.sum()],
            columns=cols
        )
    
    # the nominal info of the combined county
    for c in source_df.select_dtypes(exclude='number').columns:
        if 'state_fips' in c.lower():
            temp_df[c] = state_fips
        elif 'abbr' in c.lower():
            temp_df[c] = state_abbr
        elif 'county' in c.lower():
            temp_df[c] = county
        elif 'state' in c.lower():
            temp_df[c] = state
        elif 'fips' in c.lower():
            temp_df[c] = end_fips
            
    return temp_df

In [8]:
nyc_pop_df = combine_counties(pop_df, method='sum', using='fips', age=True)
nyc_pop_df.head()

Unnamed: 0,agegrp,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,state_fips,state,county,fips
0,0,8336817,3978439,4358378,1310690,1371286,828724,997124,9003,9861,583621,644977,2340,2704,72303,80594,1171758,1251832,2681976,1825848,18864,1228598,5044,152897,2423590,36,New York,New York City,36NYC
1,1,523718,268169,255549,78472,74068,53986,52540,332,354,34783,31883,217,189,10875,10412,89504,86103,152540,106526,686,66666,406,21287,175607,36,New York,New York City,36NYC
2,2,484313,247453,236860,65230,61726,51708,50238,357,325,32097,29782,141,123,9175,8963,88745,85703,126956,101946,682,61879,264,18138,174448,36,New York,New York City,36NYC
3,3,443786,226531,217255,58831,56158,51817,50956,796,698,28342,26175,92,108,5757,5688,80896,77472,114989,102773,1494,54517,200,11445,158368,36,New York,New York City,36NYC
4,4,439764,221600,218164,55914,55150,51748,52392,790,819,28509,27940,128,115,4696,4798,79815,76950,111064,104140,1609,56449,243,9494,156765,36,New York,New York City,36NYC


### calculating age coefficient and adding percentages

In [9]:
# engineer an 'age' column from the age group bins
def age_coefficient(df):
    
    grouped = df[df['agegrp']!=0].groupby(by='fips')
    cols = df.select_dtypes(include='number').columns.tolist()
    cols.remove('agegrp')
    age_cols = ['age_' + c[4:] for c in cols]
    
    def _age(g):
        return (g['agegrp'] * g.loc[:, cols].T).sum(axis=1)\
               / g.loc[:, cols].sum()
    
    adf = pd.DataFrame(grouped.apply(_age).values, columns=age_cols)
    df = df[df['agegrp']==0].drop(columns='agegrp').reset_index(drop=True)
    
    return pd.concat([df, adf], axis=1)

In [10]:
pop_df = pop_df.append(nyc_pop_df, ignore_index=True)
pop_df.head()

Unnamed: 0,state_fips,state,county,fips,agegrp,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic
0,1,Alabama,Autauga,1001,0,55869,27092,28777,20138,21077,5171,5927,105,138,282,364,20,20,492,464,884,787,41215,11098,243,646,40,956,1671
1,1,Alabama,Autauga,1001,1,3277,1713,1564,1180,1072,334,340,3,6,23,19,2,3,85,64,86,60,2252,674,9,42,5,149,146
2,1,Alabama,Autauga,1001,2,3465,1787,1678,1210,1134,388,359,7,8,16,25,0,1,78,81,88,70,2344,747,15,41,1,159,158
3,1,Alabama,Autauga,1001,3,3851,1977,1874,1362,1285,435,409,3,9,17,24,0,3,66,65,94,79,2647,844,12,41,3,131,173
4,1,Alabama,Autauga,1001,4,3659,1854,1805,1291,1272,429,397,4,0,21,13,3,3,43,46,63,74,2563,826,4,34,6,89,137


In [11]:
pop_df = pop_df.pipe(age_coefficient)

Kansas City and Joplin both refer to cities that cross county borders in Missouri. Therefore, we have to get our information from [census.gov quickfacts](https://www.census.gov/quickfacts). The level of demographic detail does not match our current dataframe, so we'll combine our age rows to form age coefficient columns, and then add whatever details we can.

We'll use `29KAN`, and `29JOP` as our `fips` for these two cities. 

In [12]:
mo_pop_df = pd.DataFrame(
    [['Missouri',
      'Kansas City',
      '29',
      '29KAN',
      495_327,
      int(0.551*495_327),   # white
      int(0.290*495_327),   # black
      int(0.004*495_327),   # native
      int(0.027*495_327),   # asian
      int(0.004*495_327),   # pacific
      int(0.035*495_327),   # twoplus
      int(0.102*495_327)],  # hispanic
     ['Missouri',
      'Joplin',
      '29',
      '29JOP',
      50_925,
      int(0.842*50_925),
      int(0.032*50_925),
      int(0.022*50_925),
      int(0.019*50_925),
      int(0.001*50_925),
      int(0.040*50_925),
      int(0.048*50_925)
     ]], 
    columns=['state', 'county', 'state_fips', 'fips', 'tot_pop', 'tot_pop_white', 
             'tot_pop_black', 'tot_pop_native', 'tot_pop_asian', 'tot_pop_pacific', 
             'tot_pop_twoplus', 'tot_pop_hispanic'])

In [13]:
pop_df[pop_df['fips']=='36NYC']

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic
3142,36,New York,New York City,36NYC,8336817,3978439,4358378,1310690,1371286,828724,997124,9003,9861,583621,644977,2340,2704,72303,80594,1171758,1251832,2681976,1825848,18864,1228598,5044,152897,2423590,9.173668,9.06319,9.294883,9.175985,9.507256,8.433333,8.333333,8.145161,8.222222,11.666667,9.626506,9.0,,6.616438,5.791045,7.819355,6.530769,9.333387,8.4,8.17757,10.168142,9.0,6.221429,7.231579


In [14]:
pop_df = pop_df.append(mo_pop_df, ignore_index=True)
# pop_df = pop_df.fillna(-1)
pop_df.tail()

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic
3140,56,Wyoming,Washakie,56043,7805,3963.0,3842.0,3266.0,3151.0,23.0,15.0,19.0,34.0,22.0,33.0,0.0,0.0,68.0,66.0,565.0,543.0,6417,38,53,55,0,134,1108,8.025512,7.988165,8.063687,8.168103,8.313575,7.375,6.016129,8.014493,8.853333,7.0,8.607143,4.545455,7.076923,7.192308,5.892308,6.529666,6.106992,8.239961,6.706349,8.451389,7.978261,5.916667,6.650641,6.316408
3141,56,Wyoming,Weston,56045,6927,3624.0,3303.0,3273.0,2963.0,30.0,15.0,62.0,45.0,30.0,83.0,1.0,0.0,73.0,67.0,155.0,130.0,6236,45,107,113,1,140,285,9.065343,8.91673,9.218636,9.261788,9.638527,9.913043,8.466667,9.0,9.764706,8.409091,9.484848,,,7.029412,7.318182,7.125664,6.983425,9.446782,9.342105,9.490566,9.054545,,7.171642,7.055957
3142,36,New York,New York City,36NYC,8336817,3978439.0,4358378.0,1310690.0,1371286.0,828724.0,997124.0,9003.0,9861.0,583621.0,644977.0,2340.0,2704.0,72303.0,80594.0,1171758.0,1251832.0,2681976,1825848,18864,1228598,5044,152897,2423590,9.173668,9.06319,9.294883,9.175985,9.507256,8.433333,8.333333,8.145161,8.222222,11.666667,9.626506,9.0,,6.616438,5.791045,7.819355,6.530769,9.333387,8.4,8.17757,10.168142,9.0,6.221429,7.231579
3143,29,Missouri,Kansas City,29KAN,495327,,,,,,,,,,,,,,,,,272925,143644,1981,13373,1981,17336,50523,,,,,,,,,,,,,,,,,,,,,,,,
3144,29,Missouri,Joplin,29JOP,50925,,,,,,,,,,,,,,,,,42878,1629,1120,967,50,2037,2444,,,,,,,,,,,,,,,,,,,,,,,,


FUTURE WORK: impute numbers using neighbors

In [15]:
pop_df[pop_df['fips']=='29JOP']

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic
3144,29,Missouri,Joplin,29JOP,50925,,,,,,,,,,,,,,,,,42878,1629,1120,967,50,2037,2444,,,,,,,,,,,,,,,,,,,,,,,,


In [16]:
pop_df = pop_df.sort_values(by='fips')
with open('../data/processed/pop_df.p', 'wb') as file:
    pickle.dump(pop_df, file, protocol=pickle.HIGHEST_PROTOCOL)

# 2. add census region labels

In [17]:
with urlopen('https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv') as response:
    region_df = pd.read_csv(
        response
    )

region_df.columns = region_df.columns.str.lower()

region_df.head()

# elect_df.rename(
#     columns={
#         'county_name':'county',
#         'combined_fips':'fips',
#         'votes_dem':'tot_dem',
#         'votes_gop':'tot_gop',
#         'total_votes':'tot_votes'
#     }, inplace=True
# )

# elect_df[['county']] = elect_df[['county']].apply(remove_county_terms)

# # https://stackoverflow.com/a/23836353
# elect_df['fips'] = elect_df['fips'].apply('{0:0>5}'.format) 

# elect_cols = ['state_abbr', 'county', 'fips', 'tot_dem', 'tot_gop', 'tot_votes']
# elect_df = elect_df[elect_cols]
# elect_df = elect_df.sort_values(by='fips')
# elect_df.head()

Unnamed: 0,state,state code,region,division
0,Alaska,AK,West,Pacific
1,Alabama,AL,South,East South Central
2,Arkansas,AR,South,West South Central
3,Arizona,AZ,West,Mountain
4,California,CA,West,Pacific


In [18]:
info_df = pop_df.merge(region_df[['state', 'region', 'division']], on='state')
info_df.head()

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division
0,1,Alabama,Autauga,1001,55869,27092.0,28777.0,20138.0,21077.0,5171.0,5927.0,105.0,138.0,282.0,364.0,20.0,20.0,492.0,464.0,884.0,787.0,41215,11098,243,646,40,956,1671,8.422041,8.181973,8.648052,8.518373,8.928216,7.480758,8.104269,9.733333,9.826087,7.276596,8.381868,8.4,5.85,5.195122,5.519397,6.382353,7.072427,8.727963,7.81375,9.786008,7.899381,7.125,5.35251,6.707361,South,East South Central
1,1,Alabama,Baldwin,1003,223234,108247.0,114987.0,89845.0,95902.0,9308.0,9907.0,753.0,754.0,911.0,1435.0,53.0,70.0,1832.0,1930.0,5545.0,4989.0,185747,19215,1507,2346,123,3762,10534,8.987202,8.806627,9.157192,9.150993,9.481721,7.587129,8.095791,9.14741,8.851459,7.246981,7.887108,8.150943,8.628571,5.757642,5.864767,6.497565,6.719182,9.321749,7.849388,8.999336,7.638534,8.422764,5.8126,6.602525,South,East South Central
2,1,Alabama,Barbour,1005,24686,13064.0,11622.0,5894.0,5341.0,6260.0,5547.0,52.0,43.0,55.0,61.0,21.0,10.0,153.0,132.0,629.0,488.0,11235,11807,95,116,31,285,1117,8.784412,8.463564,9.14507,9.475568,10.196592,7.767732,8.491437,9.346154,10.465116,8.109091,10.065574,6.285714,7.7,6.418301,5.810606,6.434022,5.766393,9.818336,8.107733,9.852632,9.137931,6.741935,6.136842,6.142346,South,East South Central
3,1,Alabama,Bibb,1007,22394,11929.0,10465.0,8482.0,8181.0,2912.0,1807.0,50.0,41.0,21.0,25.0,5.0,1.0,116.0,130.0,343.0,280.0,16663,4719,91,46,6,246,623,8.606145,8.349484,8.89871,8.645838,9.105122,7.706387,8.455451,8.52,8.219512,7.52381,9.8,6.4,3.0,6.422414,6.3,7.186589,6.975,8.871332,7.993219,8.384615,8.76087,5.833333,6.357724,7.091493,South,East South Central
4,1,Alabama,Blount,1009,57826,28472.0,29354.0,24494.0,25682.0,453.0,419.0,143.0,139.0,73.0,90.0,14.0,7.0,345.0,385.0,2950.0,2632.0,50176,872,282,163,21,730,5582,8.651714,8.49136,8.807249,8.771209,9.131843,8.030905,7.947494,10.125874,9.446043,8.30137,9.066667,8.142857,10.571429,6.486957,6.535065,6.4,6.06193,8.955796,7.990826,9.79078,8.723926,8.952381,6.512329,6.240595,South,East South Central


# 2. import geojson for boundaries and census areas

In [19]:
# https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html

geo_df = gp.read_file('../data/external/cb_2018_us_county_20m/cb_2018_us_county_20m.shp')
geo_df = geo_df.sort_values(by='GEOID').reset_index(drop=True)
geo_df['ALAND'] = geo_df['ALAND'] / 1e6     # convert m^2 to km^2
geo_df = geo_df[['STATEFP', 'GEOID', 'ALAND', 'geometry']]
geo_df.rename(columns={
    'STATEFP': 'state_fips', 
    'GEOID': 'fips', 
    'ALAND': 'area_land'
}, inplace=True)
geo_df.head()

Unnamed: 0,state_fips,fips,area_land,geometry
0,1,1001,1539.602123,"POLYGON ((-86.91759 32.66417, -86.71339 32.661..."
1,1,1003,4117.546676,"POLYGON ((-88.02632 30.75336, -87.94455 30.827..."
2,1,1005,2292.144655,"POLYGON ((-85.73573 31.62449, -85.66565 31.786..."
3,1,1007,1612.167481,"POLYGON ((-87.42194 33.00338, -87.31854 33.006..."
4,1,1009,1670.103911,"POLYGON ((-86.96336 33.85822, -86.92439 33.909..."


## add areas to `geo_df`

GeoJSON data for the three areas compiled from [nomanatim](https://nominatim.openstreetmap.org/) and [polygons](http://polygons.openstreetmap.fr/):
- Search for the area at [nomanatim](https://nominatim.openstreetmap.org/).
- Select `details` from the relevant entry.
- Copy the numeric `code` under `OSM`, ignoring "relation". Eg. for New York City, copy `175905`.
- Search for the `code` at [polygons](http://polygons.openstreetmap.fr/).
- For our purposes, GeoJSONs were selected according to the following criteria: (1) sparsity of vertices (`NPoints`) and (2) accuracy of shape.

In [20]:
# new york city, ny
with open('../data/external/nyc.txt') as f:
    nyc_json = json.load(f)

# kansas city, mo
with open('../data/external/kcm.txt') as f:
    kcm_json = json.load(f)

# joplin, mo
with open('../data/external/jm.txt') as f:
    jm_json = json.load(f)
    
add_to_gdf = gp.GeoDataFrame(
    [['29', '29JOP', 98.61, asShape(jm_json).buffer(0)],
     ['29', '29KAN', 815.55, asShape(kcm_json).buffer(0)],
     ['36', '36NYC', 777.95, asShape(nyc_json).buffer(0)]], 
    columns=geo_df.columns
)

geo_df = geo_df.append(add_to_gdf, ignore_index=True)
geo_df = geo_df.sort_values(by='fips').reset_index(drop=True)

## find neighbors (for clustering later)

In [21]:
# https://gis.stackexchange.com/a/281676

def county_neighbors(g):
    
    indices = g['fips'].tolist()
    neighbor_matrix = []
    
    for i, row in g.iterrows():
        neighbors = g[g['geometry'].intersects(row['geometry'])]['fips'].tolist()
        neighbors.remove(row['fips'])
        neighbor_matrix.append(neighbors)
    
    g['neighbors'] = neighbor_matrix
    return g

geo_df = geo_df.groupby(by='state_fips').apply(county_neighbors)
geo_df.head()

Unnamed: 0,state_fips,fips,area_land,geometry,neighbors
0,1,1001,1539.602123,"POLYGON ((-86.91759 32.66417, -86.71339 32.661...","[01021, 01047, 01051, 01085, 01101]"
1,1,1003,4117.546676,"POLYGON ((-88.02632 30.75336, -87.94455 30.827...","[01025, 01053, 01097, 01099, 01129]"
2,1,1005,2292.144655,"POLYGON ((-85.73573 31.62449, -85.66565 31.786...","[01011, 01045, 01067, 01109, 01113]"
3,1,1007,1612.167481,"POLYGON ((-87.42194 33.00338, -87.31854 33.006...","[01021, 01065, 01073, 01105, 01117, 01125]"
4,1,1009,1670.103911,"POLYGON ((-86.96336 33.85822, -86.92439 33.909...","[01043, 01055, 01073, 01095, 01115, 01127]"


## find centroids

We will use `shapely` to calculate the [centroid](https://en.wikipedia.org/wiki/Centroid) coordinates for the counties (in case we wish to plot bubble maps).

In [22]:
def centroid(df):
    centroids = df['geometry'].centroid
    return [c.coords[0] for c in centroids]

geo_df['lon'], geo_df['lat'] = zip(*geo_df.pipe(centroid))
geo_df.head()

Unnamed: 0,state_fips,fips,area_land,geometry,neighbors,lon,lat
0,1,1001,1539.602123,"POLYGON ((-86.91759 32.66417, -86.71339 32.661...","[01021, 01047, 01051, 01085, 01101]",-86.643648,32.538666
1,1,1003,4117.546676,"POLYGON ((-88.02632 30.75336, -87.94455 30.827...","[01025, 01053, 01097, 01099, 01129]",-87.722603,30.729584
2,1,1005,2292.144655,"POLYGON ((-85.73573 31.62449, -85.66565 31.786...","[01011, 01045, 01067, 01109, 01113]",-85.387579,31.868235
3,1,1007,1612.167481,"POLYGON ((-87.42194 33.00338, -87.31854 33.006...","[01021, 01065, 01073, 01105, 01117, 01125]",-87.125115,32.996421
4,1,1009,1670.103911,"POLYGON ((-86.96336 33.85822, -86.92439 33.909...","[01043, 01055, 01073, 01095, 01115, 01127]",-86.568495,33.98143


In [23]:
with open('../data/processed/geo_df.p', 'wb') as file:
    pickle.dump(geo_df, file, protocol=pickle.HIGHEST_PROTOCOL)

## merge with `pop_df` to begin building `info_df`

In [24]:
info_df = info_df.merge(geo_df[['fips', 'area_land', 'lon', 'lat', 'neighbors']], on='fips')
info_df.head()

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division,area_land,lon,lat,neighbors
0,1,Alabama,Autauga,1001,55869,27092.0,28777.0,20138.0,21077.0,5171.0,5927.0,105.0,138.0,282.0,364.0,20.0,20.0,492.0,464.0,884.0,787.0,41215,11098,243,646,40,956,1671,8.422041,8.181973,8.648052,8.518373,8.928216,7.480758,8.104269,9.733333,9.826087,7.276596,8.381868,8.4,5.85,5.195122,5.519397,6.382353,7.072427,8.727963,7.81375,9.786008,7.899381,7.125,5.35251,6.707361,South,East South Central,1539.602123,-86.643648,32.538666,"[01021, 01047, 01051, 01085, 01101]"
1,1,Alabama,Baldwin,1003,223234,108247.0,114987.0,89845.0,95902.0,9308.0,9907.0,753.0,754.0,911.0,1435.0,53.0,70.0,1832.0,1930.0,5545.0,4989.0,185747,19215,1507,2346,123,3762,10534,8.987202,8.806627,9.157192,9.150993,9.481721,7.587129,8.095791,9.14741,8.851459,7.246981,7.887108,8.150943,8.628571,5.757642,5.864767,6.497565,6.719182,9.321749,7.849388,8.999336,7.638534,8.422764,5.8126,6.602525,South,East South Central,4117.546676,-87.722603,30.729584,"[01025, 01053, 01097, 01099, 01129]"
2,1,Alabama,Barbour,1005,24686,13064.0,11622.0,5894.0,5341.0,6260.0,5547.0,52.0,43.0,55.0,61.0,21.0,10.0,153.0,132.0,629.0,488.0,11235,11807,95,116,31,285,1117,8.784412,8.463564,9.14507,9.475568,10.196592,7.767732,8.491437,9.346154,10.465116,8.109091,10.065574,6.285714,7.7,6.418301,5.810606,6.434022,5.766393,9.818336,8.107733,9.852632,9.137931,6.741935,6.136842,6.142346,South,East South Central,2292.144655,-85.387579,31.868235,"[01011, 01045, 01067, 01109, 01113]"
3,1,Alabama,Bibb,1007,22394,11929.0,10465.0,8482.0,8181.0,2912.0,1807.0,50.0,41.0,21.0,25.0,5.0,1.0,116.0,130.0,343.0,280.0,16663,4719,91,46,6,246,623,8.606145,8.349484,8.89871,8.645838,9.105122,7.706387,8.455451,8.52,8.219512,7.52381,9.8,6.4,3.0,6.422414,6.3,7.186589,6.975,8.871332,7.993219,8.384615,8.76087,5.833333,6.357724,7.091493,South,East South Central,1612.167481,-87.125115,32.996421,"[01021, 01065, 01073, 01105, 01117, 01125]"
4,1,Alabama,Blount,1009,57826,28472.0,29354.0,24494.0,25682.0,453.0,419.0,143.0,139.0,73.0,90.0,14.0,7.0,345.0,385.0,2950.0,2632.0,50176,872,282,163,21,730,5582,8.651714,8.49136,8.807249,8.771209,9.131843,8.030905,7.947494,10.125874,9.446043,8.30137,9.066667,8.142857,10.571429,6.486957,6.535065,6.4,6.06193,8.955796,7.990826,9.79078,8.723926,8.952381,6.512329,6.240595,South,East South Central,1670.103911,-86.568495,33.98143,"[01043, 01055, 01073, 01095, 01115, 01127]"


# 3. add 2016 general election data

Mask compliance has been very political, so it would be interesting to see how political differences vary by county. Data taken from [github.com/tonmcg](https://github.com/tonmcg). Alaska data taken from [RRH Elections](https://rrhelections.com/index.php/2018/02/02/alaska-results-by-county-equivalent-1960-2016/).

In [25]:
with urlopen('https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-16/master/2016_US_County_Level_Presidential_Results.csv') as response:
    elect_df = pd.read_csv(
        response,
        encoding='latin-1',        # to avoid unicode error
        dtype={
            'votes_dem':'int',
            'votes_gop':'int',
            'total_votes':'int',
            'combined_fips':'str'},
        index_col=0
    )

elect_df.rename(
    columns={
        'county_name':'county',
        'combined_fips':'fips',
        'votes_dem':'tot_dem',
        'votes_gop':'tot_gop',
        'total_votes':'tot_votes'
    }, inplace=True
)

elect_df[['county']] = elect_df[['county']].apply(remove_county_terms)

# https://stackoverflow.com/a/23836353
elect_df['fips'] = elect_df['fips'].apply('{0:0>5}'.format) 

elect_cols = ['state_abbr', 'county', 'fips', 'tot_dem', 'tot_gop', 'tot_votes']
elect_df = elect_df[elect_cols]
elect_df = elect_df.sort_values(by='fips')
elect_df.head()

Unnamed: 0,state_abbr,county,fips,tot_dem,tot_gop,tot_votes
29,AL,Autauga,1001,5908,18110,24661
30,AL,Baldwin,1003,18409,72780,94090
31,AL,Barbour,1005,4848,5431,10390
32,AL,Bibb,1007,1874,6733,8748
33,AL,Blount,1009,2150,22808,25384


## add New York City, Kansas City, and Joplin election data

In [26]:
nyc_elect_df = combine_counties(elect_df, method='sum', using='fips')
nyc_elect_df

Unnamed: 0,tot_dem,tot_gop,tot_votes,state_abbr,county,fips
0,1969920,461174,2490750,NY,New York City,36NYC


In [27]:
# estimate joplin
jop_fips = ['29097', '29145']
jop_elect_df = combine_counties(elect_df, using='fips', method='sum', state_abbr='MO', county='Joplin', fips=jop_fips, end_fips='29JOP')
jop_elect_df

Unnamed: 0,tot_dem,tot_gop,tot_votes,state_abbr,county,fips
0,15553,55604,74685,MO,Joplin,29JOP


In [28]:
# https://en.wikipedia.org/wiki/2016_United_States_presidential_election_in_Missouri

kan_elect_df = pd.DataFrame(
    [['MO', 'Kansas City', '29KAN', 97735, 24654, 128601]]
    , columns=elect_df.columns)
kan_elect_df

Unnamed: 0,state_abbr,county,fips,tot_dem,tot_gop,tot_votes
0,MO,Kansas City,29KAN,97735,24654,128601


In [29]:
elect_df = elect_df.append([nyc_elect_df, jop_elect_df, kan_elect_df], ignore_index=True)
elect_df.tail()

Unnamed: 0,state_abbr,county,fips,tot_dem,tot_gop,tot_votes
3139,WY,Washakie,56043,532,2911,3715
3140,WY,Weston,56045,294,2898,3334
3141,NY,New York City,36NYC,1969920,461174,2490750
3142,MO,Joplin,29JOP,15553,55604,74685
3143,MO,Kansas City,29KAN,97735,24654,128601


## add alaska elections data

Data taken from [RRH Elections](https://rrhelections.com/index.php/2018/02/02/alaska-results-by-county-equivalent-1960-2016/).

In [30]:
ak_elect_df = pd.read_excel('../data/external/2016 AK Gen Official.xlsx', sheet_name='By CE')
ak_elect_df = ak_elect_df.iloc[0:29, 0:12]
ak_elect_df.rename(
    columns={
        'Trump, Donald J. ':'tot_gop',
        'Clinton, Hillary ':'tot_dem'
    }, inplace=True
)
ak_elect_df = ak_elect_df[['ED/Muni', 'tot_gop', 'tot_dem', 'ED Total']].sort_values(by='ED/Muni')
ak_elect_df[['tot_gop', 'tot_dem', 'ED Total']] = ak_elect_df[['tot_gop', 'tot_dem', 'ED Total']].astype(int)
ak_elect_df = ak_elect_df.sort_values(by='ED/Muni')
ak_elect_df.head()

Unnamed: 0,ED/Muni,tot_gop,tot_dem,ED Total
22,Aleutians East,198,121,369
24,Aleutians West,260,493,846
19,Anchorage,39942,32130,81678
12,Bethel,809,2178,3933
25,Bristol Bay,180,99,316


In [31]:
# just checking lengths
print(len(ak_elect_df))
print(len(elect_df[elect_df['state_abbr'] == 'AK']))

29
29


In [32]:
elect_df.loc[
    elect_df['state_abbr'] == 'AK', ['tot_gop', 'tot_dem', 'tot_votes']
] = ak_elect_df[['tot_gop', 'tot_dem', 'ED Total']].values
elect_df.tail()

Unnamed: 0,state_abbr,county,fips,tot_dem,tot_gop,tot_votes
3139,WY,Washakie,56043,532,2911,3715
3140,WY,Weston,56045,294,2898,3334
3141,NY,New York City,36NYC,1969920,461174,2490750
3142,MO,Joplin,29JOP,15553,55604,74685
3143,MO,Kansas City,29KAN,97735,24654,128601


In [33]:
elect_df.loc[:, 'per_gop'] = elect_df.loc[:, 'tot_gop']\
                             / (elect_df.loc[:, 'tot_gop']
                                + elect_df.loc[:, 'tot_dem'])
elect_df.head()

Unnamed: 0,state_abbr,county,fips,tot_dem,tot_gop,tot_votes,per_gop
0,AL,Autauga,1001,5908,18110,24661,0.754018
1,AL,Baldwin,1003,18409,72780,94090,0.798123
2,AL,Barbour,1005,4848,5431,10390,0.528359
3,AL,Bibb,1007,1874,6733,8748,0.78227
4,AL,Blount,1009,2150,22808,25384,0.913855


In [34]:
with open('../data/processed/elect_df.p', 'wb') as file:
    pickle.dump(elect_df, file, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
# elect_df = pd.read_csv('../data/elect_df.csv')

In [36]:
info_df = info_df.merge(elect_df.loc[:, 'fips':'per_gop'], on='fips', how='left')
info_df.tail()

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division,area_land,lon,lat,neighbors,tot_dem,tot_gop,tot_votes,per_gop
3140,56,Wyoming,Sweetwater,56037,42343,21808.0,20535.0,17223.0,16338.0,298.0,204.0,177.0,158.0,183.0,227.0,26.0,29.0,350.0,358.0,3551.0,3221.0,33561,502,335,410,55,708,6772,8.94558,8.95502,8.934678,9.201322,9.167117,10.085106,8.962963,8.666667,7.542857,7.0,7.970588,2.0,14.0,5.837838,6.323077,6.616402,6.753086,9.185418,9.675676,8.175,7.485294,8.0,6.064748,6.679487,West,Mountain,27005.754244,-108.882788,41.659439,"[56007, 56013, 56023, 56035, 56041]",3233.0,12153.0,16661.0,0.789874
3141,56,Wyoming,Teton,56039,23464,12142.0,11322.0,9832.0,9168.0,87.0,58.0,33.0,41.0,135.0,243.0,11.0,7.0,160.0,135.0,1884.0,1670.0,19000,145,74,378,18,295,3554,7.942493,7.886051,8.002435,8.187366,8.316012,7.614094,7.990196,7.960452,8.310127,6.431694,8.026432,7.038462,7.689655,6.022857,5.368715,6.708533,6.6914,8.249993,7.766932,8.125373,7.314634,7.381818,5.69209,6.700384,West,Mountain,10351.784301,-110.589071,43.935211,"[56013, 56023, 56029, 56035]",7313.0,3920.0,12176.0,0.348972
3142,56,Wyoming,Uinta,56041,20226,10224.0,10002.0,8935.0,8722.0,64.0,62.0,69.0,75.0,36.0,56.0,11.0,13.0,182.0,130.0,927.0,944.0,17657,126,144,92,24,312,1871,8.657688,8.632103,8.685126,9.090419,9.172339,9.16092,8.896552,9.424242,8.878049,7.711111,7.452675,10.090909,9.0,6.05625,6.362963,6.478238,6.364072,9.129947,9.055172,9.121622,7.544974,9.666667,6.19661,6.424592,West,Mountain,5391.631764,-110.547578,41.287818,"[56023, 56037]",1202.0,6154.0,8053.0,0.836596
3143,56,Wyoming,Washakie,56043,7805,3963.0,3842.0,3266.0,3151.0,23.0,15.0,19.0,34.0,22.0,33.0,0.0,0.0,68.0,66.0,565.0,543.0,6417,38,53,55,0,134,1108,8.025512,7.988165,8.063687,8.168103,8.313575,7.375,6.016129,8.014493,8.853333,7.0,8.607143,4.545455,7.076923,7.192308,5.892308,6.529666,6.106992,8.239961,6.706349,8.451389,7.978261,5.916667,6.650641,6.316408,West,Mountain,5798.138762,-107.680187,43.904516,"[56003, 56013, 56017, 56019, 56025, 56029]",532.0,2911.0,3715.0,0.845484
3144,56,Wyoming,Weston,56045,6927,3624.0,3303.0,3273.0,2963.0,30.0,15.0,62.0,45.0,30.0,83.0,1.0,0.0,73.0,67.0,155.0,130.0,6236,45,107,113,1,140,285,9.065343,8.91673,9.218636,9.261788,9.638527,9.913043,8.466667,9.0,9.764706,8.409091,9.484848,,,7.029412,7.318182,7.125664,6.983425,9.446782,9.342105,9.490566,9.054545,,7.171642,7.055957,West,Mountain,6210.804116,-104.567368,43.840251,"[56005, 56009, 56011, 56027]",294.0,2898.0,3334.0,0.907895


# 4. add income data

Median income statistics taken from [data.census.gov](https://data.census.gov/cedsci/table?q=s1901&tid=ACSST1Y2018.S1901) (2017 ACS 1-Year Estimates). 

- `S1903_C03_001E` -- all households
- `S1903_C03_003E` -- black
- `S1903_C03_004E` -- native
- `S1903_C03_005E` -- asian
- `S1903_C03_006E` -- pacific
- `S1903_C03_007E` -- other
- `S1903_C03_008E` -- two or more
- `S1903_C03_009E` -- hispanic
- `S1903_C03_010E` -- white only, not hispanic

In [37]:
inc_cols = [f'S1903_C03_{i:03d}E' for i in range(1,11) if i != 2]
inc_col_names = ['median_income'] + [f'median_income_{race}'
                                     for race in ['black', 'native', 'asian', 'pacific', 'other', 'twoplus', 'hispanic', 'white']]
inc_dict = dict(zip(inc_cols, inc_col_names))
inc_dict.update({'GEO_ID':'fips'})

# can't use dtype 'int' here because of entries like `250000+` and `-`
inc_df = pd.read_csv(
    '../data/external/ACSST5Y2018.S1903/ACSST5Y2018.S1903_data_with_overlays.csv',
    usecols=['GEO_ID', 'NAME'] + inc_cols,
)
inc_df = inc_df.drop(0, axis=0)
inc_df = inc_df.rename(columns=inc_dict)
# inc_df['median_income'] = inc_df['median_income'].astype(float)

# joplin and kansas city
inc_df.at[inc_df['fips'] == '1600000US2937592', 'fips'] = '29JOP'
inc_df.at[inc_df['fips'] == '1600000US2938000', 'fips'] = '29KAN'
inc_df['fips'] = inc_df['fips'].str[-5:]

inc_df['county'], inc_df['state'] = zip(*inc_df['NAME'].str.split(', ').tolist())
inc_df.at[inc_df['fips'] == '29JOP', 'county'] = 'Joplin'
inc_df.at[inc_df['fips'] == '29KAN', 'county'] = 'Kansas City'
inc_df = inc_df.drop('NAME', axis=1)
inc_df['county'] = inc_df[['county']].apply(remove_county_terms)

# rio arriba taken from datausa.io
inc_df.at[inc_df['fips'] == '35039', 'median_income'] = 33_422

inc_df = inc_df.replace({'-': np.nan, '2,500-':2500, '250,000+':250000})

# can't do int because of nan
inc_df[inc_col_names] = inc_df[inc_col_names].astype(float)

inc_df.tail()

Unnamed: 0,fips,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,county,state
3218,72149,19855.0,25714.0,,,,19535.0,17871.0,19807.0,,Villalba Municipio,Puerto Rico
3219,72151,16013.0,14852.0,,,,29063.0,19213.0,15992.0,,Yabucoa Municipio,Puerto Rico
3220,72153,14954.0,13986.0,,,,12204.0,12650.0,14927.0,,Yauco Municipio,Puerto Rico
3221,29JOP,42782.0,32500.0,39663.0,,,,41033.0,47208.0,43473.0,Joplin,Missouri
3222,29KAN,52405.0,33899.0,48929.0,49367.0,33563.0,31768.0,50538.0,44003.0,65637.0,Kansas City,Missouri


In [38]:
inc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3222 entries, 1 to 3222
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   fips                    3222 non-null   object 
 1   median_income           3222 non-null   float64
 2   median_income_black     2019 non-null   float64
 3   median_income_native    1423 non-null   float64
 4   median_income_asian     1405 non-null   float64
 5   median_income_pacific   281 non-null    float64
 6   median_income_other     1689 non-null   float64
 7   median_income_twoplus   2190 non-null   float64
 8   median_income_hispanic  2555 non-null   float64
 9   median_income_white     3161 non-null   float64
 10  county                  3222 non-null   object 
 11  state                   3222 non-null   object 
dtypes: float64(9), object(3)
memory usage: 327.2+ KB


In [39]:
nyc_inc_df = combine_counties(inc_df, using='fips', method='mean')
nyc_inc_df

Unnamed: 0,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,fips,county,state
0,61884.61918,48418.297368,44901.135987,64497.662803,52705.343988,40763.957261,59931.081391,44189.568491,83433.357735,36NYC,New York City,New York


In [40]:
inc_df = inc_df.append(nyc_inc_df, ignore_index=True)
# inc_df[inc_col_names] = inc_df[inc_col_names].astype(int)
inc_df.tail()

Unnamed: 0,fips,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,county,state
3218,72151,16013.0,14852.0,,,,29063.0,19213.0,15992.0,,Yabucoa Municipio,Puerto Rico
3219,72153,14954.0,13986.0,,,,12204.0,12650.0,14927.0,,Yauco Municipio,Puerto Rico
3220,29JOP,42782.0,32500.0,39663.0,,,,41033.0,47208.0,43473.0,Joplin,Missouri
3221,29KAN,52405.0,33899.0,48929.0,49367.0,33563.0,31768.0,50538.0,44003.0,65637.0,Kansas City,Missouri
3222,36NYC,61884.61918,48418.297368,44901.135987,64497.662803,52705.343988,40763.957261,59931.081391,44189.568491,83433.357735,New York City,New York


In [41]:
# income_df = pd.read_csv('../data/income_df.csv')

In [42]:
inc_cols = ['state', 'county', 'fips']\
           + inc_df.select_dtypes(include='number').columns.tolist()
inc_df = inc_df[inc_cols]
inc_df.tail()

Unnamed: 0,state,county,fips,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white
3218,Puerto Rico,Yabucoa Municipio,72151,16013.0,14852.0,,,,29063.0,19213.0,15992.0,
3219,Puerto Rico,Yauco Municipio,72153,14954.0,13986.0,,,,12204.0,12650.0,14927.0,
3220,Missouri,Joplin,29JOP,42782.0,32500.0,39663.0,,,,41033.0,47208.0,43473.0
3221,Missouri,Kansas City,29KAN,52405.0,33899.0,48929.0,49367.0,33563.0,31768.0,50538.0,44003.0,65637.0
3222,New York,New York City,36NYC,61884.61918,48418.297368,44901.135987,64497.662803,52705.343988,40763.957261,59931.081391,44189.568491,83433.357735


In [43]:
with open('../data/processed/inc_df.p', 'wb') as file:
    pickle.dump(inc_df, file, protocol=pickle.HIGHEST_PROTOCOL)

In [44]:
info_df = info_df.merge(inc_df.loc[:, 'fips':], on='fips', how='left')
info_df.tail()

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division,area_land,lon,lat,neighbors,tot_dem,tot_gop,tot_votes,per_gop,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white
3140,56,Wyoming,Sweetwater,56037,42343,21808.0,20535.0,17223.0,16338.0,298.0,204.0,177.0,158.0,183.0,227.0,26.0,29.0,350.0,358.0,3551.0,3221.0,33561,502,335,410,55,708,6772,8.94558,8.95502,8.934678,9.201322,9.167117,10.085106,8.962963,8.666667,7.542857,7.0,7.970588,2.0,14.0,5.837838,6.323077,6.616402,6.753086,9.185418,9.675676,8.175,7.485294,8.0,6.064748,6.679487,West,Mountain,27005.754244,-108.882788,41.659439,"[56007, 56013, 56023, 56035, 56041]",3233.0,12153.0,16661.0,0.789874,73008.0,,62188.0,72614.0,138053.0,74189.0,55284.0,61921.0,76469.0
3141,56,Wyoming,Teton,56039,23464,12142.0,11322.0,9832.0,9168.0,87.0,58.0,33.0,41.0,135.0,243.0,11.0,7.0,160.0,135.0,1884.0,1670.0,19000,145,74,378,18,295,3554,7.942493,7.886051,8.002435,8.187366,8.316012,7.614094,7.990196,7.960452,8.310127,6.431694,8.026432,7.038462,7.689655,6.022857,5.368715,6.708533,6.6914,8.249993,7.766932,8.125373,7.314634,7.381818,5.69209,6.700384,West,Mountain,10351.784301,-110.589071,43.935211,"[56013, 56023, 56029, 56035]",7313.0,3920.0,12176.0,0.348972,83831.0,,,98125.0,,36433.0,17188.0,45361.0,95222.0
3142,56,Wyoming,Uinta,56041,20226,10224.0,10002.0,8935.0,8722.0,64.0,62.0,69.0,75.0,36.0,56.0,11.0,13.0,182.0,130.0,927.0,944.0,17657,126,144,92,24,312,1871,8.657688,8.632103,8.685126,9.090419,9.172339,9.16092,8.896552,9.424242,8.878049,7.711111,7.452675,10.090909,9.0,6.05625,6.362963,6.478238,6.364072,9.129947,9.055172,9.121622,7.544974,9.666667,6.19661,6.424592,West,Mountain,5391.631764,-110.547578,41.287818,"[56023, 56037]",1202.0,6154.0,8053.0,0.836596,58235.0,,68125.0,,,55701.0,39205.0,39816.0,61330.0
3143,56,Wyoming,Washakie,56043,7805,3963.0,3842.0,3266.0,3151.0,23.0,15.0,19.0,34.0,22.0,33.0,0.0,0.0,68.0,66.0,565.0,543.0,6417,38,53,55,0,134,1108,8.025512,7.988165,8.063687,8.168103,8.313575,7.375,6.016129,8.014493,8.853333,7.0,8.607143,4.545455,7.076923,7.192308,5.892308,6.529666,6.106992,8.239961,6.706349,8.451389,7.978261,5.916667,6.650641,6.316408,West,Mountain,5798.138762,-107.680187,43.904516,"[56003, 56013, 56017, 56019, 56025, 56029]",532.0,2911.0,3715.0,0.845484,53426.0,,,,,62054.0,36118.0,50035.0,54815.0
3144,56,Wyoming,Weston,56045,6927,3624.0,3303.0,3273.0,2963.0,30.0,15.0,62.0,45.0,30.0,83.0,1.0,0.0,73.0,67.0,155.0,130.0,6236,45,107,113,1,140,285,9.065343,8.91673,9.218636,9.261788,9.638527,9.913043,8.466667,9.0,9.764706,8.409091,9.484848,,,7.029412,7.318182,7.125664,6.983425,9.446782,9.342105,9.490566,9.054545,,7.171642,7.055957,West,Mountain,6210.804116,-104.567368,43.840251,"[56005, 56009, 56011, 56027]",294.0,2898.0,3334.0,0.907895,52867.0,,,,,,,37870.0,55032.0


# 5. add educational attainment data

Educational attainment statistics taken from [data.census.gov](https://data.census.gov/cedsci/table?tid=ACSST1Y2018.S1501&g=0400000US04) (2017 ACS 5-Year Estimates).

- `S1501_C01_006E` -- population > 25yo
- `S1501_C01_007E` -- less than 9th grade
- `S1501_C01_008E` -- some high school
- `S1501_C01_009E` -- high school or GED
- `S1501_C01_010E` -- some college
- `S1501_C01_011E` -- associate's
- `S1501_C01_012E` -- bachelor's
- `S1501_C01_013E` -- graduate or professional

In addition, there are ethnic / sex breakdowns:
- general patterns:
    - `CO1_XXXE` -- ethnic total
    - `CO3_XXXE` -- ethnic male
    - `CO5_XXXE` -- ethnic female
- `S1501_C01_031E` -- white alone
- `S1501_C01_032E` -- white alone, high school graduate or higher
- `S1501_C01_033E` -- white alone, bachelor's degree or higher
- etc.

In [45]:
# general educational attainment columns
edu_cols = [f'S1501_C01_{i:03d}E' for i in range(6,14)]
edu_col_names = ['pop_25p', 'no_hs', 'some_hs', 'hs', 'some_college', 
                 'associates', 'bachelors', 'graduate']
edu_dict = dict(zip(edu_cols, edu_col_names))
edu_dict.update({'GEO_ID':'fips'})

# education/ethnicity/sex columns
edu_eth_sex_cols = [f'S1501_C{i:02d}_{j:03d}E' for i in range(1,6,2) for j in range(31,55)]
edu_eth_sex_col_names = [f'tot_edu_{race}{sex}{edu}' 
                         for sex in ['', '_male', '_female']
                         for race in ['white', 'black', 'native', 'asian', 'pacific', 'other', 'twoplus', 'hispanic']
                         for edu in ['', '_hsplus', '_4yplus']]
edu_eth_sex_dict = dict(zip(edu_eth_sex_cols, edu_eth_sex_col_names))
edu_dict.update(edu_eth_sex_dict)

edu_df = pd.read_csv('../data/external/ACSST5Y2018.S1501/ACSST5Y2018.S1501_data_with_overlays.csv',
                     usecols=['GEO_ID', 'NAME']+edu_cols+edu_eth_sex_cols)
edu_df = edu_df.drop(0, axis=0)
for col in (edu_cols+edu_eth_sex_cols):
    edu_df[col] = edu_df[col].astype(int)
edu_df.rename(
    columns=edu_dict,
    inplace=True
)

# joplin and kansas city
edu_df.at[edu_df['fips'] == '1600000US2937592', 'fips'] = '29JOP'
edu_df.at[edu_df['fips'] == '1600000US2938000', 'fips'] = '29KAN'
edu_df['fips'] = edu_df['fips'].str[-5:]

edu_df['county'], edu_df['state'] = zip(*edu_df['NAME'].str.split(', ').tolist())
edu_df.at[edu_df['fips'] == '29JOP', 'county'] = 'Joplin'
edu_df.at[edu_df['fips'] == '29KAN', 'county'] = 'Kansas City'
edu_df = edu_df.drop('NAME', axis=1)
edu_df['county'] = edu_df[['county']].apply(remove_county_terms)

edu_df.head()

Unnamed: 0,fips,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,tot_edu_white_male_hsplus,tot_edu_white_female_hsplus,tot_edu_white_4yplus,tot_edu_white_male_4yplus,tot_edu_white_female_4yplus,tot_edu_black,tot_edu_black_male,tot_edu_black_female,tot_edu_black_hsplus,tot_edu_black_male_hsplus,tot_edu_black_female_hsplus,tot_edu_black_4yplus,tot_edu_black_male_4yplus,tot_edu_black_female_4yplus,tot_edu_native,tot_edu_native_male,tot_edu_native_female,tot_edu_native_hsplus,tot_edu_native_male_hsplus,tot_edu_native_female_hsplus,tot_edu_native_4yplus,tot_edu_native_male_4yplus,tot_edu_native_female_4yplus,tot_edu_asian,tot_edu_asian_male,tot_edu_asian_female,tot_edu_asian_hsplus,tot_edu_asian_male_hsplus,tot_edu_asian_female_hsplus,tot_edu_asian_4yplus,tot_edu_asian_male_4yplus,tot_edu_asian_female_4yplus,tot_edu_pacific,tot_edu_pacific_male,tot_edu_pacific_female,tot_edu_pacific_hsplus,tot_edu_pacific_male_hsplus,tot_edu_pacific_female_hsplus,tot_edu_pacific_4yplus,tot_edu_pacific_male_4yplus,tot_edu_pacific_female_4yplus,tot_edu_other,tot_edu_other_male,tot_edu_other_female,tot_edu_other_hsplus,tot_edu_other_male_hsplus,tot_edu_other_female_hsplus,tot_edu_other_4yplus,tot_edu_other_male_4yplus,tot_edu_other_female_4yplus,tot_edu_twoplus,tot_edu_twoplus_male,tot_edu_twoplus_female,tot_edu_twoplus_hsplus,tot_edu_twoplus_male_hsplus,tot_edu_twoplus_female_hsplus,tot_edu_twoplus_4yplus,tot_edu_twoplus_male_4yplus,tot_edu_twoplus_female_4yplus,tot_edu_hispanic,tot_edu_hispanic_male,tot_edu_hispanic_female,tot_edu_hispanic_hsplus,tot_edu_hispanic_male_hsplus,tot_edu_hispanic_female_hsplus,tot_edu_hispanic_4yplus,tot_edu_hispanic_male_4yplus,tot_edu_hispanic_female_4yplus,pop_25p,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,county,state
1,1001,28726,13834,14892,26130,12588,13542,8440,4573,3867,6786,3042,3744,5459,2436,3023,1296,573,723,78,39,39,61,25,36,25,25,0,317,95,222,278,62,216,118,43,75,32,5,27,32,5,27,0,0,0,262,93,169,197,67,130,0,0,0,380,135,245,269,73,196,92,0,92,939,455,484,821,380,441,346,230,116,37166,956,3248,12119,7554,2998,5903,4388,Autauga,Alabama
2,1003,126316,60310,66006,116288,54788,61500,41648,19863,21785,12006,5593,6413,9565,4129,5436,2164,808,1356,1015,523,492,790,410,380,145,81,64,1180,426,754,980,271,709,243,147,96,9,0,9,9,0,9,0,0,0,938,469,469,695,394,301,262,119,143,1712,853,859,1529,744,785,559,199,360,5119,2749,2370,3823,1813,2010,1389,637,752,146989,3978,10332,40579,32266,13759,30431,15644,Baldwin,Alabama
3,1005,9171,4846,4325,7264,3657,3607,1578,814,764,8137,4304,3833,5551,2776,2775,552,240,312,72,72,0,42,42,0,0,0,0,88,40,48,72,27,45,5,0,5,1,0,1,0,0,0,0,0,0,345,230,115,100,76,24,44,44,0,183,80,103,153,50,103,9,0,9,573,395,178,251,192,59,76,60,16,18173,1490,3411,6486,3287,1279,1417,803,Barbour,Alabama
4,1007,12002,6037,5965,10483,5181,5302,1570,674,896,3316,2146,1170,2296,1377,919,200,83,117,8,8,0,8,8,0,0,0,0,37,16,21,37,16,21,37,16,21,0,0,0,0,0,0,0,0,0,9,9,0,9,9,0,0,0,0,149,108,41,108,89,19,6,6,0,313,171,142,206,95,111,0,0,0,15780,903,1747,7471,2938,908,1197,616,Bibb,Alabama
5,1009,35774,17200,18574,29814,14167,15647,4775,1900,2875,596,281,315,411,192,219,22,10,12,132,22,110,94,14,80,13,13,0,124,43,81,104,43,61,62,25,37,18,0,18,18,0,18,0,0,0,211,106,105,120,56,64,90,28,62,440,212,228,361,154,207,24,22,2,2610,1468,1142,1006,476,530,82,48,34,39627,2967,4894,13489,8492,4775,3217,1793,Blount,Alabama


In [46]:
nyc_edu_df = combine_counties(edu_df, using='fips', method='sum')
nyc_edu_df

Unnamed: 0,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,tot_edu_white_male_hsplus,tot_edu_white_female_hsplus,tot_edu_white_4yplus,tot_edu_white_male_4yplus,tot_edu_white_female_4yplus,tot_edu_black,tot_edu_black_male,tot_edu_black_female,tot_edu_black_hsplus,tot_edu_black_male_hsplus,tot_edu_black_female_hsplus,tot_edu_black_4yplus,tot_edu_black_male_4yplus,tot_edu_black_female_4yplus,tot_edu_native,tot_edu_native_male,tot_edu_native_female,tot_edu_native_hsplus,tot_edu_native_male_hsplus,tot_edu_native_female_hsplus,tot_edu_native_4yplus,tot_edu_native_male_4yplus,tot_edu_native_female_4yplus,tot_edu_asian,tot_edu_asian_male,tot_edu_asian_female,tot_edu_asian_hsplus,tot_edu_asian_male_hsplus,tot_edu_asian_female_hsplus,tot_edu_asian_4yplus,tot_edu_asian_male_4yplus,tot_edu_asian_female_4yplus,tot_edu_pacific,tot_edu_pacific_male,tot_edu_pacific_female,tot_edu_pacific_hsplus,tot_edu_pacific_male_hsplus,tot_edu_pacific_female_hsplus,tot_edu_pacific_4yplus,tot_edu_pacific_male_4yplus,tot_edu_pacific_female_4yplus,tot_edu_other,tot_edu_other_male,tot_edu_other_female,tot_edu_other_hsplus,tot_edu_other_male_hsplus,tot_edu_other_female_hsplus,tot_edu_other_4yplus,tot_edu_other_male_4yplus,tot_edu_other_female_4yplus,tot_edu_twoplus,tot_edu_twoplus_male,tot_edu_twoplus_female,tot_edu_twoplus_hsplus,tot_edu_twoplus_male_hsplus,tot_edu_twoplus_female_hsplus,tot_edu_twoplus_4yplus,tot_edu_twoplus_male_4yplus,tot_edu_twoplus_female_4yplus,tot_edu_hispanic,tot_edu_hispanic_male,tot_edu_hispanic_female,tot_edu_hispanic_hsplus,tot_edu_hispanic_male_hsplus,tot_edu_hispanic_female_hsplus,tot_edu_hispanic_4yplus,tot_edu_hispanic_male_4yplus,tot_edu_hispanic_female_4yplus,pop_25p,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,fips,county,state
0,2064537,997110,1067427,1924581,929762,994819,1206238,580787,625451,1398115,601774,796341,1160648,495537,665111,338438,128228,210210,24268,12218,12050,16850,8277,8573,4195,1866,2329,868465,404417,464048,656860,313388,343472,362529,170329,192200,3124,1501,1623,2552,1203,1349,781,450,331,797060,374668,422392,511112,240567,270545,110988,46547,64441,163206,75108,88098,130813,59869,70944,60444,25802,34642,1571933,740285,831648,1064667,500637,564030,281071,120324,160747,5923498,565345,523873,1421617,815961,379457,1292814,924431,36NYC,New York City,New York


In [47]:
edu_df = edu_df.append(nyc_edu_df, ignore_index=True)
edu_df.tail()

Unnamed: 0,fips,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,tot_edu_white_male_hsplus,tot_edu_white_female_hsplus,tot_edu_white_4yplus,tot_edu_white_male_4yplus,tot_edu_white_female_4yplus,tot_edu_black,tot_edu_black_male,tot_edu_black_female,tot_edu_black_hsplus,tot_edu_black_male_hsplus,tot_edu_black_female_hsplus,tot_edu_black_4yplus,tot_edu_black_male_4yplus,tot_edu_black_female_4yplus,tot_edu_native,tot_edu_native_male,tot_edu_native_female,tot_edu_native_hsplus,tot_edu_native_male_hsplus,tot_edu_native_female_hsplus,tot_edu_native_4yplus,tot_edu_native_male_4yplus,tot_edu_native_female_4yplus,tot_edu_asian,tot_edu_asian_male,tot_edu_asian_female,tot_edu_asian_hsplus,tot_edu_asian_male_hsplus,tot_edu_asian_female_hsplus,tot_edu_asian_4yplus,tot_edu_asian_male_4yplus,tot_edu_asian_female_4yplus,tot_edu_pacific,tot_edu_pacific_male,tot_edu_pacific_female,tot_edu_pacific_hsplus,tot_edu_pacific_male_hsplus,tot_edu_pacific_female_hsplus,tot_edu_pacific_4yplus,tot_edu_pacific_male_4yplus,tot_edu_pacific_female_4yplus,tot_edu_other,tot_edu_other_male,tot_edu_other_female,tot_edu_other_hsplus,tot_edu_other_male_hsplus,tot_edu_other_female_hsplus,tot_edu_other_4yplus,tot_edu_other_male_4yplus,tot_edu_other_female_4yplus,tot_edu_twoplus,tot_edu_twoplus_male,tot_edu_twoplus_female,tot_edu_twoplus_hsplus,tot_edu_twoplus_male_hsplus,tot_edu_twoplus_female_hsplus,tot_edu_twoplus_4yplus,tot_edu_twoplus_male_4yplus,tot_edu_twoplus_female_4yplus,tot_edu_hispanic,tot_edu_hispanic_male,tot_edu_hispanic_female,tot_edu_hispanic_hsplus,tot_edu_hispanic_male_hsplus,tot_edu_hispanic_female_hsplus,tot_edu_hispanic_4yplus,tot_edu_hispanic_male_4yplus,tot_edu_hispanic_female_4yplus,pop_25p,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,county,state
3218,72151,0,0,0,0,0,0,0,0,0,15706,7363,8343,10864,4831,6033,2677,865,1812,39,0,39,39,0,39,39,0,39,12,12,0,12,12,0,12,12,0,0,0,0,0,0,0,0,0,0,450,233,217,315,179,136,114,66,48,349,156,193,180,63,117,45,9,36,23904,11323,12581,16684,7409,9275,4431,1519,2912,23916,4975,2245,5972,3636,2645,3706,737,Yabucoa Municipio,Puerto Rico
3219,72153,29,16,13,19,6,13,5,0,5,925,471,454,652,329,323,235,100,135,75,35,40,56,16,40,18,8,10,5,5,0,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,4430,2202,2228,3131,1513,1618,830,288,542,292,127,165,168,85,83,69,35,34,25926,12022,13904,18700,8291,10409,6365,2430,3935,25976,4977,2259,8182,2381,1791,4902,1484,Yauco Municipio,Puerto Rico
3220,29JOP,29516,13887,15629,26830,12641,14189,7607,3766,3841,963,493,470,874,435,439,173,144,29,653,261,392,497,243,254,92,32,60,523,213,310,449,171,278,234,74,160,38,0,38,0,0,0,0,0,0,265,129,136,159,74,85,68,29,39,905,490,415,836,433,403,266,196,70,1136,500,636,788,384,404,180,78,102,33571,779,2580,10582,8462,2576,5759,2833,Joplin,Missouri
3221,29KAN,196115,96625,99490,184949,90966,93983,86232,41929,44303,87359,37954,49405,75251,32052,43199,13994,5465,8529,1537,750,787,1318,654,664,376,134,242,8573,4180,4393,6870,3402,3468,4038,2030,2008,398,162,236,292,141,151,104,97,7,10735,5533,5202,6278,3387,2891,1062,603,459,6881,3436,3445,6173,3063,3110,2637,1119,1518,26509,13802,12707,18460,9584,8876,4559,2335,2224,325065,11373,22302,82996,73203,23673,69682,41836,Kansas City,Missouri
3222,36NYC,2064537,997110,1067427,1924581,929762,994819,1206238,580787,625451,1398115,601774,796341,1160648,495537,665111,338438,128228,210210,24268,12218,12050,16850,8277,8573,4195,1866,2329,868465,404417,464048,656860,313388,343472,362529,170329,192200,3124,1501,1623,2552,1203,1349,781,450,331,797060,374668,422392,511112,240567,270545,110988,46547,64441,163206,75108,88098,130813,59869,70944,60444,25802,34642,1571933,740285,831648,1064667,500637,564030,281071,120324,160747,5923498,565345,523873,1421617,815961,379457,1292814,924431,New York City,New York


In [48]:
edu_df['edu'] = (edu_df['some_hs'] + 2*edu_df['hs'] \
                 + 3*edu_df['some_college'] + 4*edu_df['associates'] \
                 + 5*edu_df['bachelors'] + 6*edu_df['graduate'])\
                / edu_df['pop_25p']
for race in ['white', 'black', 'native', 'asian', 'pacific', 'other', 'twoplus', 'hispanic']:
    for sex in ['', '_male', '_female']:
        edu_df[f'edu_{race}{sex}'] = (2*edu_df[f'tot_edu_{race}{sex}_hsplus']
                                      + 5*edu_df[f'tot_edu_{race}{sex}_4yplus'])\
                                     / edu_df[f'tot_edu_{race}{sex}']
        edu_df[f'per_edu_{race}{sex}_nohs'] = (edu_df[f'tot_edu_{race}{sex}']
                                           - edu_df[f'tot_edu_{race}{sex}_hsplus'])\
                                          / edu_df[f'tot_edu_{race}{sex}']
# edu_df = edu_df.fillna(-1)
edu_df.tail()

Unnamed: 0,fips,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,tot_edu_white_male_hsplus,tot_edu_white_female_hsplus,tot_edu_white_4yplus,tot_edu_white_male_4yplus,tot_edu_white_female_4yplus,tot_edu_black,tot_edu_black_male,tot_edu_black_female,tot_edu_black_hsplus,tot_edu_black_male_hsplus,tot_edu_black_female_hsplus,tot_edu_black_4yplus,tot_edu_black_male_4yplus,tot_edu_black_female_4yplus,tot_edu_native,tot_edu_native_male,tot_edu_native_female,tot_edu_native_hsplus,tot_edu_native_male_hsplus,tot_edu_native_female_hsplus,tot_edu_native_4yplus,tot_edu_native_male_4yplus,tot_edu_native_female_4yplus,tot_edu_asian,tot_edu_asian_male,tot_edu_asian_female,tot_edu_asian_hsplus,tot_edu_asian_male_hsplus,tot_edu_asian_female_hsplus,tot_edu_asian_4yplus,tot_edu_asian_male_4yplus,tot_edu_asian_female_4yplus,tot_edu_pacific,tot_edu_pacific_male,tot_edu_pacific_female,tot_edu_pacific_hsplus,tot_edu_pacific_male_hsplus,tot_edu_pacific_female_hsplus,tot_edu_pacific_4yplus,tot_edu_pacific_male_4yplus,tot_edu_pacific_female_4yplus,tot_edu_other,tot_edu_other_male,tot_edu_other_female,tot_edu_other_hsplus,tot_edu_other_male_hsplus,tot_edu_other_female_hsplus,tot_edu_other_4yplus,tot_edu_other_male_4yplus,tot_edu_other_female_4yplus,tot_edu_twoplus,tot_edu_twoplus_male,tot_edu_twoplus_female,tot_edu_twoplus_hsplus,tot_edu_twoplus_male_hsplus,tot_edu_twoplus_female_hsplus,tot_edu_twoplus_4yplus,tot_edu_twoplus_male_4yplus,tot_edu_twoplus_female_4yplus,tot_edu_hispanic,tot_edu_hispanic_male,tot_edu_hispanic_female,tot_edu_hispanic_hsplus,tot_edu_hispanic_male_hsplus,tot_edu_hispanic_female_hsplus,tot_edu_hispanic_4yplus,tot_edu_hispanic_male_4yplus,tot_edu_hispanic_female_4yplus,pop_25p,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,county,state,edu,edu_white,per_edu_white_nohs,edu_white_male,per_edu_white_male_nohs,edu_white_female,per_edu_white_female_nohs,edu_black,per_edu_black_nohs,edu_black_male,per_edu_black_male_nohs,edu_black_female,per_edu_black_female_nohs,edu_native,per_edu_native_nohs,edu_native_male,per_edu_native_male_nohs,edu_native_female,per_edu_native_female_nohs,edu_asian,per_edu_asian_nohs,edu_asian_male,per_edu_asian_male_nohs,edu_asian_female,per_edu_asian_female_nohs,edu_pacific,per_edu_pacific_nohs,edu_pacific_male,per_edu_pacific_male_nohs,edu_pacific_female,per_edu_pacific_female_nohs,edu_other,per_edu_other_nohs,edu_other_male,per_edu_other_male_nohs,edu_other_female,per_edu_other_female_nohs,edu_twoplus,per_edu_twoplus_nohs,edu_twoplus_male,per_edu_twoplus_male_nohs,edu_twoplus_female,per_edu_twoplus_female_nohs,edu_hispanic,per_edu_hispanic_nohs,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs
3218,72151,0,0,0,0,0,0,0,0,0,15706,7363,8343,10864,4831,6033,2677,865,1812,39,0,39,39,0,39,39,0,39,12,12,0,12,12,0,12,12,0,0,0,0,0,0,0,0,0,0,450,233,217,315,179,136,114,66,48,349,156,193,180,63,117,45,9,36,23904,11323,12581,16684,7409,9275,4431,1519,2912,23916,4975,2245,5972,3636,2645,3706,737,Yabucoa Municipio,Puerto Rico,2.451455,,,,,,,2.235642,0.30829,1.899633,0.343882,2.532183,0.276879,7.0,0.0,,,7.0,0.0,7.0,0.0,7.0,0.0,,,,,,,,,2.666667,0.3,2.95279,0.23176,2.359447,0.373272,1.676218,0.484241,1.096154,0.596154,2.145078,0.393782,2.322749,0.302041,1.979422,0.345668,2.631746,0.262777
3219,72153,29,16,13,19,6,13,5,0,5,925,471,454,652,329,323,235,100,135,75,35,40,56,16,40,18,8,10,5,5,0,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,4430,2202,2228,3131,1513,1618,830,288,542,292,127,165,168,85,83,69,35,34,25926,12022,13904,18700,8291,10409,6365,2430,3935,25976,4977,2259,8182,2381,1791,4902,1484,Yauco Municipio,Puerto Rico,2.55405,2.172414,0.344828,0.75,0.625,3.923077,0.0,2.68,0.295135,2.458599,0.301486,2.909692,0.288546,2.693333,0.253333,2.057143,0.542857,3.25,0.0,2.0,0.0,2.0,0.0,,,,,,,,,2.350339,0.293228,2.028156,0.312897,2.668761,0.273788,2.332192,0.424658,2.716535,0.330709,2.036364,0.49697,2.6701,0.278716,2.389952,0.310348,2.912327,0.251367
3220,29JOP,29516,13887,15629,26830,12641,14189,7607,3766,3841,963,493,470,874,435,439,173,144,29,653,261,392,497,243,254,92,32,60,523,213,310,449,171,278,234,74,160,38,0,38,0,0,0,0,0,0,265,129,136,159,74,85,68,29,39,905,490,415,836,433,403,266,196,70,1136,500,636,788,384,404,180,78,102,33571,779,2580,10582,8462,2576,5759,2833,Joplin,Missouri,3.134461,3.10662,0.091001,3.176496,0.089724,3.044533,0.092136,2.713396,0.09242,3.225152,0.117647,2.176596,0.065957,2.226646,0.238897,2.475096,0.068966,2.061224,0.352041,3.954111,0.141491,3.342723,0.197183,4.374194,0.103226,0.0,1.0,,,0.0,1.0,2.483019,0.4,2.271318,0.426357,2.683824,0.375,3.317127,0.076243,3.767347,0.116327,2.785542,0.028916,2.179577,0.306338,2.316,0.232,2.072327,0.36478
3221,29KAN,196115,96625,99490,184949,90966,93983,86232,41929,44303,87359,37954,49405,75251,32052,43199,13994,5465,8529,1537,750,787,1318,654,664,376,134,242,8573,4180,4393,6870,3402,3468,4038,2030,2008,398,162,236,292,141,151,104,97,7,10735,5533,5202,6278,3387,2891,1062,603,459,6881,3436,3445,6173,3063,3110,2637,1119,1518,26509,13802,12707,18460,9584,8876,4559,2335,2224,325065,11373,22302,82996,73203,23673,69682,41836,Kansas City,Missouri,3.390156,4.084634,0.056936,4.052543,0.058567,4.115801,0.055352,2.523747,0.1386,2.408942,0.155504,2.611942,0.125615,2.938191,0.142485,2.637333,0.128,3.224905,0.15629,3.957774,0.198647,4.055981,0.186124,3.86433,0.210562,2.773869,0.266332,4.734568,0.12963,1.427966,0.360169,1.664276,0.415184,1.769203,0.387855,1.552672,0.444252,3.710362,0.102892,3.411234,0.108556,4.008708,0.097242,2.252631,0.303633,2.234676,0.305608,2.272133,0.301487
3222,36NYC,2064537,997110,1067427,1924581,929762,994819,1206238,580787,625451,1398115,601774,796341,1160648,495537,665111,338438,128228,210210,24268,12218,12050,16850,8277,8573,4195,1866,2329,868465,404417,464048,656860,313388,343472,362529,170329,192200,3124,1501,1623,2552,1203,1349,781,450,331,797060,374668,422392,511112,240567,270545,110988,46547,64441,163206,75108,88098,130813,59869,70944,60444,25802,34642,1571933,740285,831648,1064667,500637,564030,281071,120324,160747,5923498,565345,523873,1421617,815961,379457,1292814,924431,New York City,New York,3.265549,4.785747,0.067791,4.777265,0.067543,4.79367,0.068022,2.870641,0.169848,2.712337,0.17654,2.990267,0.164791,2.252967,0.30567,2.118514,0.322557,2.389295,0.288548,3.599874,0.243654,3.655685,0.225087,3.551236,0.259835,2.883803,0.183099,3.101932,0.198534,2.68207,0.168823,1.978727,0.358753,1.905338,0.35792,2.043824,0.359493,3.454812,0.198479,3.311871,0.202894,3.576676,0.194715,2.248626,0.322702,2.165239,0.323724,2.322852,0.321792


In [49]:
with open('../data/processed/edu_df.p', 'wb') as file:
    pickle.dump(edu_df, file, protocol=pickle.HIGHEST_PROTOCOL)

In [50]:
# edu_df = pd.read_csv('../data/processed/edu_df.csv')

In [51]:
info_df = info_df.merge(
    edu_df[['fips']+edu_df.select_dtypes(include='number').columns.tolist()], 
    on='fips', 
    how='left'
)
info_df.tail()

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division,area_land,lon,lat,neighbors,tot_dem,tot_gop,tot_votes,per_gop,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,...,tot_edu_twoplus,tot_edu_twoplus_male,tot_edu_twoplus_female,tot_edu_twoplus_hsplus,tot_edu_twoplus_male_hsplus,tot_edu_twoplus_female_hsplus,tot_edu_twoplus_4yplus,tot_edu_twoplus_male_4yplus,tot_edu_twoplus_female_4yplus,tot_edu_hispanic,tot_edu_hispanic_male,tot_edu_hispanic_female,tot_edu_hispanic_hsplus,tot_edu_hispanic_male_hsplus,tot_edu_hispanic_female_hsplus,tot_edu_hispanic_4yplus,tot_edu_hispanic_male_4yplus,tot_edu_hispanic_female_4yplus,pop_25p,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,edu,edu_white,per_edu_white_nohs,edu_white_male,per_edu_white_male_nohs,edu_white_female,per_edu_white_female_nohs,edu_black,per_edu_black_nohs,edu_black_male,per_edu_black_male_nohs,edu_black_female,per_edu_black_female_nohs,edu_native,per_edu_native_nohs,edu_native_male,per_edu_native_male_nohs,edu_native_female,per_edu_native_female_nohs,edu_asian,per_edu_asian_nohs,edu_asian_male,per_edu_asian_male_nohs,edu_asian_female,per_edu_asian_female_nohs,edu_pacific,per_edu_pacific_nohs,edu_pacific_male,per_edu_pacific_male_nohs,edu_pacific_female,per_edu_pacific_female_nohs,edu_other,per_edu_other_nohs,edu_other_male,per_edu_other_male_nohs,edu_other_female,per_edu_other_female_nohs,edu_twoplus,per_edu_twoplus_nohs,edu_twoplus_male,per_edu_twoplus_male_nohs,edu_twoplus_female,per_edu_twoplus_female_nohs,edu_hispanic,per_edu_hispanic_nohs,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs
3140,56,Wyoming,Sweetwater,56037,42343,21808.0,20535.0,17223.0,16338.0,298.0,204.0,177.0,158.0,183.0,227.0,26.0,29.0,350.0,358.0,3551.0,3221.0,33561,502,335,410,55,708,6772,8.94558,8.95502,8.934678,9.201322,9.167117,10.085106,8.962963,8.666667,7.542857,7.0,7.970588,2.0,14.0,5.837838,6.323077,6.616402,6.753086,9.185418,9.675676,8.175,7.485294,8.0,6.064748,6.679487,West,Mountain,27005.754244,-108.882788,41.659439,"[56007, 56013, 56023, 56035, 56041]",3233.0,12153.0,16661.0,0.789874,73008.0,,62188.0,72614.0,138053.0,74189.0,55284.0,61921.0,76469.0,23640,12157,11483,22176,...,257,121,136,257,121,136,69,19,50,3774,2017,1757,2735,1389,1346,524,140,384,28333,633,1916,9433,6994,3114,4298,1945,3.084036,3.037521,0.061929,2.939706,0.06194,3.141078,0.061918,4.245714,0.005714,4.10084,0.0,4.553571,0.017857,1.651543,0.174229,1.724138,0.137931,1.551724,0.224138,3.966019,0.169903,4.583333,0.0,3.712329,0.239726,2.0,0.0,2.0,0.0,,,2.09816,0.134969,1.590698,0.204651,3.081081,0.0,3.342412,0.0,2.785124,0.0,3.838235,0.0,2.143614,0.275305,1.724343,0.311353,2.624929,0.233921
3141,56,Wyoming,Teton,56039,23464,12142.0,11322.0,9832.0,9168.0,87.0,58.0,33.0,41.0,135.0,243.0,11.0,7.0,160.0,135.0,1884.0,1670.0,19000,145,74,378,18,295,3554,7.942493,7.886051,8.002435,8.187366,8.316012,7.614094,7.990196,7.960452,8.310127,6.431694,8.026432,7.038462,7.689655,6.022857,5.368715,6.708533,6.6914,8.249993,7.766932,8.125373,7.314634,7.381818,5.69209,6.700384,West,Mountain,10351.784301,-110.589071,43.935211,"[56013, 56023, 56029, 56035]",7313.0,3920.0,12176.0,0.348972,83831.0,,,98125.0,,36433.0,17188.0,45361.0,95222.0,14776,7716,7060,14565,...,52,52,0,52,52,0,51,51,0,2053,978,1075,1367,579,788,269,0,269,17164,457,501,2272,3219,868,6488,3359,4.123048,5.146521,0.01428,5.023328,0.01957,5.281161,0.008499,4.470588,0.235294,2.0,0.0,5.0,0.285714,2.0,0.0,2.0,0.0,2.0,0.0,4.839286,0.232143,7.0,0.0,3.441176,0.382353,0.0,1.0,0.0,1.0,,,2.201691,0.264493,1.311258,0.344371,2.712928,0.218631,6.903846,0.0,6.903846,0.0,,,1.986849,0.334145,1.184049,0.407975,2.717209,0.266977
3142,56,Wyoming,Uinta,56041,20226,10224.0,10002.0,8935.0,8722.0,64.0,62.0,69.0,75.0,36.0,56.0,11.0,13.0,182.0,130.0,927.0,944.0,17657,126,144,92,24,312,1871,8.657688,8.632103,8.685126,9.090419,9.172339,9.16092,8.896552,9.424242,8.878049,7.711111,7.452675,10.090909,9.0,6.05625,6.362963,6.478238,6.364072,9.129947,9.055172,9.121622,7.544974,9.666667,6.19661,6.424592,West,Mountain,5391.631764,-110.547578,41.287818,"[56023, 56037]",1202.0,6154.0,8053.0,0.836596,58235.0,,68125.0,,,55701.0,39205.0,39816.0,61330.0,11678,5807,5871,10942,...,350,180,170,336,179,157,14,14,0,829,430,399,651,361,290,37,12,25,12915,288,646,5176,3420,1390,1356,639,2.898335,2.688303,0.063024,2.638023,0.084381,2.738034,0.041901,4.7,0.15,7.0,0.0,1.25,0.375,2.435644,0.079208,2.8,0.145455,2.0,0.0,7.0,0.0,,,7.0,0.0,,,,,,,1.451777,0.274112,1.6875,0.15625,1.014493,0.492754,2.12,0.04,2.377778,0.005556,1.847059,0.076471,1.793727,0.214717,1.818605,0.160465,1.766917,0.273183
3143,56,Wyoming,Washakie,56043,7805,3963.0,3842.0,3266.0,3151.0,23.0,15.0,19.0,34.0,22.0,33.0,0.0,0.0,68.0,66.0,565.0,543.0,6417,38,53,55,0,134,1108,8.025512,7.988165,8.063687,8.168103,8.313575,7.375,6.016129,8.014493,8.853333,7.0,8.607143,4.545455,7.076923,7.192308,5.892308,6.529666,6.106992,8.239961,6.706349,8.451389,7.978261,5.916667,6.650641,6.316408,West,Mountain,5798.138762,-107.680187,43.904516,"[56003, 56013, 56017, 56019, 56025, 56029]",532.0,2911.0,3715.0,0.845484,53426.0,,,,,62054.0,36118.0,50035.0,54815.0,4848,2425,2423,4418,...,210,72,138,202,64,138,9,0,9,632,324,308,480,218,262,59,24,35,5662,181,409,1717,1434,701,849,371,3.076651,2.98288,0.088696,3.027629,0.13567,2.938093,0.041684,,,,,,,5.857143,0.0,5.857143,0.0,,,,,,,,,,,,,,,0.767442,0.616279,0.639175,0.680412,0.933333,0.533333,2.138095,0.038095,1.777778,0.111111,2.326087,0.0,1.985759,0.240506,1.716049,0.32716,2.269481,0.149351
3144,56,Wyoming,Weston,56045,6927,3624.0,3303.0,3273.0,2963.0,30.0,15.0,62.0,45.0,30.0,83.0,1.0,0.0,73.0,67.0,155.0,130.0,6236,45,107,113,1,140,285,9.065343,8.91673,9.218636,9.261788,9.638527,9.913043,8.466667,9.0,9.764706,8.409091,9.484848,,,7.029412,7.318182,7.125664,6.983425,9.446782,9.342105,9.490566,9.054545,,7.171642,7.055957,West,Mountain,6210.804116,-104.567368,43.840251,"[56005, 56009, 56011, 56027]",294.0,2898.0,3334.0,0.907895,52867.0,,,,,,,37870.0,55032.0,4689,2450,2239,4381,...,52,12,40,52,12,40,17,0,17,95,80,15,72,57,15,3,3,0,5014,129,260,1796,1334,534,676,285,3.007579,2.85498,0.065686,2.747755,0.066939,2.972309,0.064314,,,,,,,1.2,0.4,2.0,0.0,0.0,1.0,1.815029,0.323699,1.26087,0.369565,2.015748,0.307087,2.0,0.0,2.0,0.0,,,,,,,,,3.634615,0.0,2.0,0.0,4.125,0.0,1.673684,0.242105,1.6125,0.2875,2.0,0.0


# 6. add mask usage statistics

In [52]:
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/mask-use/mask-use-by-county.csv') as response:
    mask_df = pd.read_csv(response)
    
mask_df.rename(columns={'COUNTYFP':'fips'}, inplace=True)
mask_df['fips'] = mask_df['fips'].apply('{0:0>5}'.format)
mask_df.columns = mask_df.columns.str.lower()

mask_df.head()

Unnamed: 0,fips,never,rarely,sometimes,frequently,always
0,1001,0.053,0.074,0.134,0.295,0.444
1,1003,0.083,0.059,0.098,0.323,0.436
2,1005,0.067,0.121,0.12,0.201,0.491
3,1007,0.02,0.034,0.096,0.278,0.572
4,1009,0.053,0.114,0.18,0.194,0.459


In [53]:
nyc_mask_df = combine_counties(mask_df, using='fips', method='mean')
nyc_mask_df

Unnamed: 0,never,rarely,sometimes,frequently,always,fips
0,0.029802,0.022852,0.057863,0.137004,0.75248,36NYC


In [54]:
# estimates from averaging counties
kan_fips = ['29095', '29047', '29165', '29037']
jop_mask_df = combine_counties(mask_df, using='fips', method='mean', fips=jop_fips, end_fips='29JOP')
kan_mask_df = combine_counties(mask_df, using='fips', method='mean', fips=kan_fips, end_fips='29KAN')

In [55]:
mask_df = mask_df.append([nyc_mask_df, jop_mask_df, kan_mask_df], ignore_index=True)
mask_df.tail()

Unnamed: 0,fips,never,rarely,sometimes,frequently,always
3140,56043,0.204,0.155,0.069,0.285,0.287
3141,56045,0.142,0.129,0.148,0.207,0.374
3142,36NYC,0.029802,0.022852,0.057863,0.137004,0.75248
3143,29JOP,0.143838,0.141946,0.14473,0.187351,0.383135
3144,29KAN,0.029815,0.058889,0.092879,0.203477,0.614758


In [56]:
mask_df['mask'] = mask_df['rarely'] + 2*mask_df['sometimes']\
                  + 3*mask_df['frequently'] + 4*mask_df['always']
mask_df.tail()

Unnamed: 0,fips,never,rarely,sometimes,frequently,always,mask
3140,56043,0.204,0.155,0.069,0.285,0.287,2.296
3141,56045,0.142,0.129,0.148,0.207,0.374,2.542
3142,36NYC,0.029802,0.022852,0.057863,0.137004,0.75248,3.559508
3143,29JOP,0.143838,0.141946,0.14473,0.187351,0.383135,2.526
3144,29KAN,0.029815,0.058889,0.092879,0.203477,0.614758,3.314113


In [57]:
with open('../data/processed/mask_df.p', 'wb') as file:
    pickle.dump(mask_df, file, protocol=pickle.HIGHEST_PROTOCOL)

In [58]:
# mask_df = pd.read_csv('../data/processed/mask_df.csv')

In [59]:
info_df = info_df.merge(mask_df, on='fips', how='left')
info_df.tail()

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division,area_land,lon,lat,neighbors,tot_dem,tot_gop,tot_votes,per_gop,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,...,tot_edu_twoplus_4yplus,tot_edu_twoplus_male_4yplus,tot_edu_twoplus_female_4yplus,tot_edu_hispanic,tot_edu_hispanic_male,tot_edu_hispanic_female,tot_edu_hispanic_hsplus,tot_edu_hispanic_male_hsplus,tot_edu_hispanic_female_hsplus,tot_edu_hispanic_4yplus,tot_edu_hispanic_male_4yplus,tot_edu_hispanic_female_4yplus,pop_25p,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,edu,edu_white,per_edu_white_nohs,edu_white_male,per_edu_white_male_nohs,edu_white_female,per_edu_white_female_nohs,edu_black,per_edu_black_nohs,edu_black_male,per_edu_black_male_nohs,edu_black_female,per_edu_black_female_nohs,edu_native,per_edu_native_nohs,edu_native_male,per_edu_native_male_nohs,edu_native_female,per_edu_native_female_nohs,edu_asian,per_edu_asian_nohs,edu_asian_male,per_edu_asian_male_nohs,edu_asian_female,per_edu_asian_female_nohs,edu_pacific,per_edu_pacific_nohs,edu_pacific_male,per_edu_pacific_male_nohs,edu_pacific_female,per_edu_pacific_female_nohs,edu_other,per_edu_other_nohs,edu_other_male,per_edu_other_male_nohs,edu_other_female,per_edu_other_female_nohs,edu_twoplus,per_edu_twoplus_nohs,edu_twoplus_male,per_edu_twoplus_male_nohs,edu_twoplus_female,per_edu_twoplus_female_nohs,edu_hispanic,per_edu_hispanic_nohs,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs,never,rarely,sometimes,frequently,always,mask
3140,56,Wyoming,Sweetwater,56037,42343,21808.0,20535.0,17223.0,16338.0,298.0,204.0,177.0,158.0,183.0,227.0,26.0,29.0,350.0,358.0,3551.0,3221.0,33561,502,335,410,55,708,6772,8.94558,8.95502,8.934678,9.201322,9.167117,10.085106,8.962963,8.666667,7.542857,7.0,7.970588,2.0,14.0,5.837838,6.323077,6.616402,6.753086,9.185418,9.675676,8.175,7.485294,8.0,6.064748,6.679487,West,Mountain,27005.754244,-108.882788,41.659439,"[56007, 56013, 56023, 56035, 56041]",3233.0,12153.0,16661.0,0.789874,73008.0,,62188.0,72614.0,138053.0,74189.0,55284.0,61921.0,76469.0,23640,12157,11483,22176,...,69,19,50,3774,2017,1757,2735,1389,1346,524,140,384,28333,633,1916,9433,6994,3114,4298,1945,3.084036,3.037521,0.061929,2.939706,0.06194,3.141078,0.061918,4.245714,0.005714,4.10084,0.0,4.553571,0.017857,1.651543,0.174229,1.724138,0.137931,1.551724,0.224138,3.966019,0.169903,4.583333,0.0,3.712329,0.239726,2.0,0.0,2.0,0.0,,,2.09816,0.134969,1.590698,0.204651,3.081081,0.0,3.342412,0.0,2.785124,0.0,3.838235,0.0,2.143614,0.275305,1.724343,0.311353,2.624929,0.233921,0.061,0.295,0.23,0.146,0.268,2.265
3141,56,Wyoming,Teton,56039,23464,12142.0,11322.0,9832.0,9168.0,87.0,58.0,33.0,41.0,135.0,243.0,11.0,7.0,160.0,135.0,1884.0,1670.0,19000,145,74,378,18,295,3554,7.942493,7.886051,8.002435,8.187366,8.316012,7.614094,7.990196,7.960452,8.310127,6.431694,8.026432,7.038462,7.689655,6.022857,5.368715,6.708533,6.6914,8.249993,7.766932,8.125373,7.314634,7.381818,5.69209,6.700384,West,Mountain,10351.784301,-110.589071,43.935211,"[56013, 56023, 56029, 56035]",7313.0,3920.0,12176.0,0.348972,83831.0,,,98125.0,,36433.0,17188.0,45361.0,95222.0,14776,7716,7060,14565,...,51,51,0,2053,978,1075,1367,579,788,269,0,269,17164,457,501,2272,3219,868,6488,3359,4.123048,5.146521,0.01428,5.023328,0.01957,5.281161,0.008499,4.470588,0.235294,2.0,0.0,5.0,0.285714,2.0,0.0,2.0,0.0,2.0,0.0,4.839286,0.232143,7.0,0.0,3.441176,0.382353,0.0,1.0,0.0,1.0,,,2.201691,0.264493,1.311258,0.344371,2.712928,0.218631,6.903846,0.0,6.903846,0.0,,,1.986849,0.334145,1.184049,0.407975,2.717209,0.266977,0.095,0.157,0.16,0.247,0.34,2.578
3142,56,Wyoming,Uinta,56041,20226,10224.0,10002.0,8935.0,8722.0,64.0,62.0,69.0,75.0,36.0,56.0,11.0,13.0,182.0,130.0,927.0,944.0,17657,126,144,92,24,312,1871,8.657688,8.632103,8.685126,9.090419,9.172339,9.16092,8.896552,9.424242,8.878049,7.711111,7.452675,10.090909,9.0,6.05625,6.362963,6.478238,6.364072,9.129947,9.055172,9.121622,7.544974,9.666667,6.19661,6.424592,West,Mountain,5391.631764,-110.547578,41.287818,"[56023, 56037]",1202.0,6154.0,8053.0,0.836596,58235.0,,68125.0,,,55701.0,39205.0,39816.0,61330.0,11678,5807,5871,10942,...,14,14,0,829,430,399,651,361,290,37,12,25,12915,288,646,5176,3420,1390,1356,639,2.898335,2.688303,0.063024,2.638023,0.084381,2.738034,0.041901,4.7,0.15,7.0,0.0,1.25,0.375,2.435644,0.079208,2.8,0.145455,2.0,0.0,7.0,0.0,,,7.0,0.0,,,,,,,1.451777,0.274112,1.6875,0.15625,1.014493,0.492754,2.12,0.04,2.377778,0.005556,1.847059,0.076471,1.793727,0.214717,1.818605,0.160465,1.766917,0.273183,0.098,0.278,0.154,0.207,0.264,2.263
3143,56,Wyoming,Washakie,56043,7805,3963.0,3842.0,3266.0,3151.0,23.0,15.0,19.0,34.0,22.0,33.0,0.0,0.0,68.0,66.0,565.0,543.0,6417,38,53,55,0,134,1108,8.025512,7.988165,8.063687,8.168103,8.313575,7.375,6.016129,8.014493,8.853333,7.0,8.607143,4.545455,7.076923,7.192308,5.892308,6.529666,6.106992,8.239961,6.706349,8.451389,7.978261,5.916667,6.650641,6.316408,West,Mountain,5798.138762,-107.680187,43.904516,"[56003, 56013, 56017, 56019, 56025, 56029]",532.0,2911.0,3715.0,0.845484,53426.0,,,,,62054.0,36118.0,50035.0,54815.0,4848,2425,2423,4418,...,9,0,9,632,324,308,480,218,262,59,24,35,5662,181,409,1717,1434,701,849,371,3.076651,2.98288,0.088696,3.027629,0.13567,2.938093,0.041684,,,,,,,5.857143,0.0,5.857143,0.0,,,,,,,,,,,,,,,0.767442,0.616279,0.639175,0.680412,0.933333,0.533333,2.138095,0.038095,1.777778,0.111111,2.326087,0.0,1.985759,0.240506,1.716049,0.32716,2.269481,0.149351,0.204,0.155,0.069,0.285,0.287,2.296
3144,56,Wyoming,Weston,56045,6927,3624.0,3303.0,3273.0,2963.0,30.0,15.0,62.0,45.0,30.0,83.0,1.0,0.0,73.0,67.0,155.0,130.0,6236,45,107,113,1,140,285,9.065343,8.91673,9.218636,9.261788,9.638527,9.913043,8.466667,9.0,9.764706,8.409091,9.484848,,,7.029412,7.318182,7.125664,6.983425,9.446782,9.342105,9.490566,9.054545,,7.171642,7.055957,West,Mountain,6210.804116,-104.567368,43.840251,"[56005, 56009, 56011, 56027]",294.0,2898.0,3334.0,0.907895,52867.0,,,,,,,37870.0,55032.0,4689,2450,2239,4381,...,17,0,17,95,80,15,72,57,15,3,3,0,5014,129,260,1796,1334,534,676,285,3.007579,2.85498,0.065686,2.747755,0.066939,2.972309,0.064314,,,,,,,1.2,0.4,2.0,0.0,0.0,1.0,1.815029,0.323699,1.26087,0.369565,2.015748,0.307087,2.0,0.0,2.0,0.0,,,,,,,,,3.634615,0.0,2.0,0.0,4.125,0.0,1.673684,0.242105,1.6125,0.2875,2.0,0.0,0.142,0.129,0.148,0.207,0.374,2.542


In [60]:
info_df.columns

Index(['state_fips', 'state', 'county', 'fips', 'tot_pop', 'tot_male',
       'tot_female', 'tot_pop_white_male', 'tot_pop_white_female',
       'tot_pop_black_male',
       ...
       'edu_hispanic_male', 'per_edu_hispanic_male_nohs',
       'edu_hispanic_female', 'per_edu_hispanic_female_nohs', 'never',
       'rarely', 'sometimes', 'frequently', 'always', 'mask'],
      dtype='object', length=206)

In [61]:
def per_population(df, divisor='tot_pop', ignore=['tot_pop']):
    
    cols = [c for c in df.columns.tolist() if c[:4]=='tot_']
    
    for c in cols:
        if c not in ignore:
            df[c.replace('tot_', 'per_')] = df[c] / df[divisor]
    
    return df

In [62]:
info_df['pop_density'] = info_df['tot_pop'] / info_df['area_land']

edu_cols_to_ignore = [c for c in info_df.columns if 'edu' in c]
info_df = info_df.pipe(
    per_population, 
    ignore=['tot_pop', 'tot_dem', 'tot_gop']+edu_cols_to_ignore
)
info_df.head()

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division,area_land,lon,lat,neighbors,tot_dem,tot_gop,tot_votes,per_gop,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,...,edu_white_female,per_edu_white_female_nohs,edu_black,per_edu_black_nohs,edu_black_male,per_edu_black_male_nohs,edu_black_female,per_edu_black_female_nohs,edu_native,per_edu_native_nohs,edu_native_male,per_edu_native_male_nohs,edu_native_female,per_edu_native_female_nohs,edu_asian,per_edu_asian_nohs,edu_asian_male,per_edu_asian_male_nohs,edu_asian_female,per_edu_asian_female_nohs,edu_pacific,per_edu_pacific_nohs,edu_pacific_male,per_edu_pacific_male_nohs,edu_pacific_female,per_edu_pacific_female_nohs,edu_other,per_edu_other_nohs,edu_other_male,per_edu_other_male_nohs,edu_other_female,per_edu_other_female_nohs,edu_twoplus,per_edu_twoplus_nohs,edu_twoplus_male,per_edu_twoplus_male_nohs,edu_twoplus_female,per_edu_twoplus_female_nohs,edu_hispanic,per_edu_hispanic_nohs,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs,never,rarely,sometimes,frequently,always,mask,pop_density,per_male,per_female,per_pop_white_male,per_pop_white_female,per_pop_black_male,per_pop_black_female,per_pop_native_male,per_pop_native_female,per_pop_asian_male,per_pop_asian_female,per_pop_pacific_male,per_pop_pacific_female,per_pop_twoplus_male,per_pop_twoplus_female,per_pop_hispanic_male,per_pop_hispanic_female,per_pop_white,per_pop_black,per_pop_native,per_pop_asian,per_pop_pacific,per_pop_twoplus,per_pop_hispanic,per_votes
0,1,Alabama,Autauga,1001,55869,27092.0,28777.0,20138.0,21077.0,5171.0,5927.0,105.0,138.0,282.0,364.0,20.0,20.0,492.0,464.0,884.0,787.0,41215,11098,243,646,40,956,1671,8.422041,8.181973,8.648052,8.518373,8.928216,7.480758,8.104269,9.733333,9.826087,7.276596,8.381868,8.4,5.85,5.195122,5.519397,6.382353,7.072427,8.727963,7.81375,9.786008,7.899381,7.125,5.35251,6.707361,South,East South Central,1539.602123,-86.643648,32.538666,"[01021, 01047, 01051, 01085, 01101]",5908.0,18110.0,24661.0,0.754018,58786.0,27643.0,,,,,,83423.0,65047.0,28726,13834,14892,26130,...,3.117043,0.090653,2.563808,0.19555,2.543393,0.199211,2.580395,0.192575,3.166667,0.217949,4.487179,0.358974,1.846154,0.076923,3.615142,0.123028,3.568421,0.347368,3.635135,0.027027,2.0,0.0,2.0,0.0,2.0,0.0,1.503817,0.248092,1.44086,0.27957,1.538462,0.230769,2.626316,0.292105,1.081481,0.459259,3.477551,0.2,3.591054,0.125666,4.197802,0.164835,3.020661,0.088843,0.053,0.074,0.134,0.295,0.444,3.003,36.287947,0.48492,0.51508,0.36045,0.377258,0.092556,0.106087,0.001879,0.00247,0.005048,0.006515,0.000358,0.000358,0.008806,0.008305,0.015823,0.014087,0.737708,0.198643,0.004349,0.011563,0.000716,0.017111,0.029909,0.441408
1,1,Alabama,Baldwin,1003,223234,108247.0,114987.0,89845.0,95902.0,9308.0,9907.0,753.0,754.0,911.0,1435.0,53.0,70.0,1832.0,1930.0,5545.0,4989.0,185747,19215,1507,2346,123,3762,10534,8.987202,8.806627,9.157192,9.150993,9.481721,7.587129,8.095791,9.14741,8.851459,7.246981,7.887108,8.150943,8.628571,5.757642,5.864767,6.497565,6.719182,9.321749,7.849388,8.999336,7.638534,8.422764,5.8126,6.602525,South,East South Central,4117.546676,-87.722603,30.729584,"[01025, 01053, 01097, 01099, 01129]",18409.0,72780.0,94090.0,0.798123,55962.0,31112.0,53289.0,34763.0,,45634.0,53456.0,43279.0,59418.0,126316,60310,66006,116288,...,3.513696,0.068267,2.494586,0.203315,2.19882,0.261756,2.752534,0.152347,2.270936,0.221675,2.342256,0.216061,2.195122,0.227642,2.690678,0.169492,2.997653,0.36385,2.517241,0.059682,2.0,0.0,,,2.0,0.0,2.878465,0.259062,2.948827,0.159915,2.808102,0.358209,3.418808,0.106893,2.910903,0.127784,3.923166,0.086147,2.850361,0.253174,2.477628,0.340487,3.2827,0.151899,0.083,0.059,0.098,0.323,0.436,2.968,54.215293,0.484904,0.515096,0.40247,0.429603,0.041696,0.044379,0.003373,0.003378,0.004081,0.006428,0.000237,0.000314,0.008207,0.008646,0.024839,0.022349,0.832073,0.086076,0.006751,0.010509,0.000551,0.016852,0.047188,0.421486
2,1,Alabama,Barbour,1005,24686,13064.0,11622.0,5894.0,5341.0,6260.0,5547.0,52.0,43.0,55.0,61.0,21.0,10.0,153.0,132.0,629.0,488.0,11235,11807,95,116,31,285,1117,8.784412,8.463564,9.14507,9.475568,10.196592,7.767732,8.491437,9.346154,10.465116,8.109091,10.065574,6.285714,7.7,6.418301,5.810606,6.434022,5.766393,9.818336,8.107733,9.852632,9.137931,6.741935,6.136842,6.142346,South,East South Central,2292.144655,-85.387579,31.868235,"[01011, 01045, 01067, 01109, 01113]",4848.0,5431.0,10390.0,0.528359,34186.0,23013.0,,50417.0,,26793.0,19760.0,30417.0,47031.0,9171,4846,4325,7264,...,2.551214,0.166012,1.703576,0.317808,1.568773,0.355019,1.854944,0.276024,1.166667,0.416667,1.166667,0.416667,,,1.920455,0.181818,1.35,0.325,2.395833,0.0625,0.0,1.0,,,0.0,1.0,1.217391,0.710145,1.617391,0.669565,0.417391,0.791304,1.918033,0.163934,1.25,0.375,2.436893,0.0,1.539267,0.561955,1.731646,0.513924,1.11236,0.668539,0.067,0.121,0.12,0.201,0.491,2.928,10.769826,0.529207,0.470793,0.238759,0.216357,0.253585,0.224702,0.002106,0.001742,0.002228,0.002471,0.000851,0.000405,0.006198,0.005347,0.02548,0.019768,0.455116,0.478287,0.003848,0.004699,0.001256,0.011545,0.045248,0.420886
3,1,Alabama,Bibb,1007,22394,11929.0,10465.0,8482.0,8181.0,2912.0,1807.0,50.0,41.0,21.0,25.0,5.0,1.0,116.0,130.0,343.0,280.0,16663,4719,91,46,6,246,623,8.606145,8.349484,8.89871,8.645838,9.105122,7.706387,8.455451,8.52,8.219512,7.52381,9.8,6.4,3.0,6.422414,6.3,7.186589,6.975,8.871332,7.993219,8.384615,8.76087,5.833333,6.357724,7.091493,South,East South Central,1612.167481,-87.125115,32.996421,"[01021, 01065, 01073, 01105, 01117, 01125]",1874.0,6733.0,8748.0,0.78227,45340.0,34000.0,,,,,20329.0,42708.0,50769.0,12002,6037,5965,10483,...,2.528751,0.111148,1.686369,0.3076,1.476701,0.358341,2.07094,0.21453,2.0,0.0,2.0,0.0,,,7.0,0.0,7.0,0.0,7.0,0.0,,,,,,,2.0,0.0,2.0,0.0,,,1.651007,0.275168,1.925926,0.175926,0.926829,0.536585,1.316294,0.341853,1.111111,0.444444,1.56338,0.21831,0.02,0.034,0.096,0.278,0.572,3.348,13.890616,0.532687,0.467313,0.378762,0.365321,0.130035,0.080691,0.002233,0.001831,0.000938,0.001116,0.000223,4.5e-05,0.00518,0.005805,0.015317,0.012503,0.744083,0.210726,0.004064,0.002054,0.000268,0.010985,0.02782,0.39064
4,1,Alabama,Blount,1009,57826,28472.0,29354.0,24494.0,25682.0,453.0,419.0,143.0,139.0,73.0,90.0,14.0,7.0,345.0,385.0,2950.0,2632.0,50176,872,282,163,21,730,5582,8.651714,8.49136,8.807249,8.771209,9.131843,8.030905,7.947494,10.125874,9.446043,8.30137,9.066667,8.142857,10.571429,6.486957,6.535065,6.4,6.06193,8.955796,7.990826,9.79078,8.723926,8.952381,6.512329,6.240595,South,East South Central,1670.103911,-86.568495,33.98143,"[01043, 01055, 01073, 01095, 01115, 01127]",2150.0,22808.0,25384.0,0.913855,48695.0,,65385.0,99219.0,,,44934.0,35495.0,49872.0,35774,17200,18574,29814,...,2.45876,0.157586,1.563758,0.310403,1.544484,0.316726,1.580952,0.304762,1.916667,0.287879,4.227273,0.363636,1.454545,0.272727,4.177419,0.16129,4.906977,0.0,3.790123,0.246914,2.0,0.0,,,2.0,0.0,3.270142,0.43128,2.377358,0.471698,4.171429,0.390476,1.913636,0.179545,1.971698,0.273585,1.859649,0.092105,0.927969,0.614559,0.811989,0.675749,1.077058,0.535902,0.053,0.114,0.18,0.194,0.459,2.892,34.624193,0.492374,0.507626,0.423581,0.444125,0.007834,0.007246,0.002473,0.002404,0.001262,0.001556,0.000242,0.000121,0.005966,0.006658,0.051015,0.045516,0.867707,0.01508,0.004877,0.002819,0.000363,0.012624,0.096531,0.438972


In [63]:
info_df[info_df.isna().any(axis=1)]

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division,area_land,lon,lat,neighbors,tot_dem,tot_gop,tot_votes,per_gop,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,...,edu_white_female,per_edu_white_female_nohs,edu_black,per_edu_black_nohs,edu_black_male,per_edu_black_male_nohs,edu_black_female,per_edu_black_female_nohs,edu_native,per_edu_native_nohs,edu_native_male,per_edu_native_male_nohs,edu_native_female,per_edu_native_female_nohs,edu_asian,per_edu_asian_nohs,edu_asian_male,per_edu_asian_male_nohs,edu_asian_female,per_edu_asian_female_nohs,edu_pacific,per_edu_pacific_nohs,edu_pacific_male,per_edu_pacific_male_nohs,edu_pacific_female,per_edu_pacific_female_nohs,edu_other,per_edu_other_nohs,edu_other_male,per_edu_other_male_nohs,edu_other_female,per_edu_other_female_nohs,edu_twoplus,per_edu_twoplus_nohs,edu_twoplus_male,per_edu_twoplus_male_nohs,edu_twoplus_female,per_edu_twoplus_female_nohs,edu_hispanic,per_edu_hispanic_nohs,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs,never,rarely,sometimes,frequently,always,mask,pop_density,per_male,per_female,per_pop_white_male,per_pop_white_female,per_pop_black_male,per_pop_black_female,per_pop_native_male,per_pop_native_female,per_pop_asian_male,per_pop_asian_female,per_pop_pacific_male,per_pop_pacific_female,per_pop_twoplus_male,per_pop_twoplus_female,per_pop_hispanic_male,per_pop_hispanic_female,per_pop_white,per_pop_black,per_pop_native,per_pop_asian,per_pop_pacific,per_pop_twoplus,per_pop_hispanic,per_votes
0,01,Alabama,Autauga,01001,55869,27092.0,28777.0,20138.0,21077.0,5171.0,5927.0,105.0,138.0,282.0,364.0,20.0,20.0,492.0,464.0,884.0,787.0,41215,11098,243,646,40,956,1671,8.422041,8.181973,8.648052,8.518373,8.928216,7.480758,8.104269,9.733333,9.826087,7.276596,8.381868,8.400000,5.850000,5.195122,5.519397,6.382353,7.072427,8.727963,7.813750,9.786008,7.899381,7.125000,5.352510,6.707361,South,East South Central,1539.602123,-86.643648,32.538666,"[01021, 01047, 01051, 01085, 01101]",5908.0,18110.0,24661.0,0.754018,58786.0,27643.0,,,,,,83423.0,65047.0,28726,13834,14892,26130,...,3.117043,0.090653,2.563808,0.195550,2.543393,0.199211,2.580395,0.192575,3.166667,0.217949,4.487179,0.358974,1.846154,0.076923,3.615142,0.123028,3.568421,0.347368,3.635135,0.027027,2.0,0.0,2.0,0.0,2.0,0.0,1.503817,0.248092,1.440860,0.279570,1.538462,0.230769,2.626316,0.292105,1.081481,0.459259,3.477551,0.200000,3.591054,0.125666,4.197802,0.164835,3.020661,0.088843,0.053,0.074,0.134,0.295,0.444,3.003,36.287947,0.484920,0.515080,0.360450,0.377258,0.092556,0.106087,0.001879,0.002470,0.005048,0.006515,0.000358,0.000358,0.008806,0.008305,0.015823,0.014087,0.737708,0.198643,0.004349,0.011563,0.000716,0.017111,0.029909,0.441408
1,01,Alabama,Baldwin,01003,223234,108247.0,114987.0,89845.0,95902.0,9308.0,9907.0,753.0,754.0,911.0,1435.0,53.0,70.0,1832.0,1930.0,5545.0,4989.0,185747,19215,1507,2346,123,3762,10534,8.987202,8.806627,9.157192,9.150993,9.481721,7.587129,8.095791,9.147410,8.851459,7.246981,7.887108,8.150943,8.628571,5.757642,5.864767,6.497565,6.719182,9.321749,7.849388,8.999336,7.638534,8.422764,5.812600,6.602525,South,East South Central,4117.546676,-87.722603,30.729584,"[01025, 01053, 01097, 01099, 01129]",18409.0,72780.0,94090.0,0.798123,55962.0,31112.0,53289.0,34763.0,,45634.0,53456.0,43279.0,59418.0,126316,60310,66006,116288,...,3.513696,0.068267,2.494586,0.203315,2.198820,0.261756,2.752534,0.152347,2.270936,0.221675,2.342256,0.216061,2.195122,0.227642,2.690678,0.169492,2.997653,0.363850,2.517241,0.059682,2.0,0.0,,,2.0,0.0,2.878465,0.259062,2.948827,0.159915,2.808102,0.358209,3.418808,0.106893,2.910903,0.127784,3.923166,0.086147,2.850361,0.253174,2.477628,0.340487,3.282700,0.151899,0.083,0.059,0.098,0.323,0.436,2.968,54.215293,0.484904,0.515096,0.402470,0.429603,0.041696,0.044379,0.003373,0.003378,0.004081,0.006428,0.000237,0.000314,0.008207,0.008646,0.024839,0.022349,0.832073,0.086076,0.006751,0.010509,0.000551,0.016852,0.047188,0.421486
2,01,Alabama,Barbour,01005,24686,13064.0,11622.0,5894.0,5341.0,6260.0,5547.0,52.0,43.0,55.0,61.0,21.0,10.0,153.0,132.0,629.0,488.0,11235,11807,95,116,31,285,1117,8.784412,8.463564,9.145070,9.475568,10.196592,7.767732,8.491437,9.346154,10.465116,8.109091,10.065574,6.285714,7.700000,6.418301,5.810606,6.434022,5.766393,9.818336,8.107733,9.852632,9.137931,6.741935,6.136842,6.142346,South,East South Central,2292.144655,-85.387579,31.868235,"[01011, 01045, 01067, 01109, 01113]",4848.0,5431.0,10390.0,0.528359,34186.0,23013.0,,50417.0,,26793.0,19760.0,30417.0,47031.0,9171,4846,4325,7264,...,2.551214,0.166012,1.703576,0.317808,1.568773,0.355019,1.854944,0.276024,1.166667,0.416667,1.166667,0.416667,,,1.920455,0.181818,1.350000,0.325000,2.395833,0.062500,0.0,1.0,,,0.0,1.0,1.217391,0.710145,1.617391,0.669565,0.417391,0.791304,1.918033,0.163934,1.250000,0.375000,2.436893,0.000000,1.539267,0.561955,1.731646,0.513924,1.112360,0.668539,0.067,0.121,0.120,0.201,0.491,2.928,10.769826,0.529207,0.470793,0.238759,0.216357,0.253585,0.224702,0.002106,0.001742,0.002228,0.002471,0.000851,0.000405,0.006198,0.005347,0.025480,0.019768,0.455116,0.478287,0.003848,0.004699,0.001256,0.011545,0.045248,0.420886
3,01,Alabama,Bibb,01007,22394,11929.0,10465.0,8482.0,8181.0,2912.0,1807.0,50.0,41.0,21.0,25.0,5.0,1.0,116.0,130.0,343.0,280.0,16663,4719,91,46,6,246,623,8.606145,8.349484,8.898710,8.645838,9.105122,7.706387,8.455451,8.520000,8.219512,7.523810,9.800000,6.400000,3.000000,6.422414,6.300000,7.186589,6.975000,8.871332,7.993219,8.384615,8.760870,5.833333,6.357724,7.091493,South,East South Central,1612.167481,-87.125115,32.996421,"[01021, 01065, 01073, 01105, 01117, 01125]",1874.0,6733.0,8748.0,0.782270,45340.0,34000.0,,,,,20329.0,42708.0,50769.0,12002,6037,5965,10483,...,2.528751,0.111148,1.686369,0.307600,1.476701,0.358341,2.070940,0.214530,2.000000,0.000000,2.000000,0.000000,,,7.000000,0.000000,7.000000,0.000000,7.000000,0.000000,,,,,,,2.000000,0.000000,2.000000,0.000000,,,1.651007,0.275168,1.925926,0.175926,0.926829,0.536585,1.316294,0.341853,1.111111,0.444444,1.563380,0.218310,0.020,0.034,0.096,0.278,0.572,3.348,13.890616,0.532687,0.467313,0.378762,0.365321,0.130035,0.080691,0.002233,0.001831,0.000938,0.001116,0.000223,0.000045,0.005180,0.005805,0.015317,0.012503,0.744083,0.210726,0.004064,0.002054,0.000268,0.010985,0.027820,0.390640
4,01,Alabama,Blount,01009,57826,28472.0,29354.0,24494.0,25682.0,453.0,419.0,143.0,139.0,73.0,90.0,14.0,7.0,345.0,385.0,2950.0,2632.0,50176,872,282,163,21,730,5582,8.651714,8.491360,8.807249,8.771209,9.131843,8.030905,7.947494,10.125874,9.446043,8.301370,9.066667,8.142857,10.571429,6.486957,6.535065,6.400000,6.061930,8.955796,7.990826,9.790780,8.723926,8.952381,6.512329,6.240595,South,East South Central,1670.103911,-86.568495,33.981430,"[01043, 01055, 01073, 01095, 01115, 01127]",2150.0,22808.0,25384.0,0.913855,48695.0,,65385.0,99219.0,,,44934.0,35495.0,49872.0,35774,17200,18574,29814,...,2.458760,0.157586,1.563758,0.310403,1.544484,0.316726,1.580952,0.304762,1.916667,0.287879,4.227273,0.363636,1.454545,0.272727,4.177419,0.161290,4.906977,0.000000,3.790123,0.246914,2.0,0.0,,,2.0,0.0,3.270142,0.431280,2.377358,0.471698,4.171429,0.390476,1.913636,0.179545,1.971698,0.273585,1.859649,0.092105,0.927969,0.614559,0.811989,0.675749,1.077058,0.535902,0.053,0.114,0.180,0.194,0.459,2.892,34.624193,0.492374,0.507626,0.423581,0.444125,0.007834,0.007246,0.002473,0.002404,0.001262,0.001556,0.000242,0.000121,0.005966,0.006658,0.051015,0.045516,0.867707,0.015080,0.004877,0.002819,0.000363,0.012624,0.096531,0.438972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3140,56,Wyoming,Sweetwater,56037,42343,21808.0,20535.0,17223.0,16338.0,298.0,204.0,177.0,158.0,183.0,227.0,26.0,29.0,350.0,358.0,3551.0,3221.0,33561,502,335,410,55,708,6772,8.945580,8.955020,8.934678,9.201322,9.167117,10.085106,8.962963,8.666667,7.542857,7.000000,7.970588,2.000000,14.000000,5.837838,6.323077,6.616402,6.753086,9.185418,9.675676,8.175000,7.485294,8.000000,6.064748,6.679487,West,Mountain,27005.754244,-108.882788,41.659439,"[56007, 56013, 56023, 56035, 56041]",3233.0,12153.0,16661.0,0.789874,73008.0,,62188.0,72614.0,138053.0,74189.0,55284.0,61921.0,76469.0,23640,12157,11483,22176,...,3.141078,0.061918,4.245714,0.005714,4.100840,0.000000,4.553571,0.017857,1.651543,0.174229,1.724138,0.137931,1.551724,0.224138,3.966019,0.169903,4.583333,0.000000,3.712329,0.239726,2.0,0.0,2.0,0.0,,,2.098160,0.134969,1.590698,0.204651,3.081081,0.000000,3.342412,0.000000,2.785124,0.000000,3.838235,0.000000,2.143614,0.275305,1.724343,0.311353,2.624929,0.233921,0.061,0.295,0.230,0.146,0.268,2.265,1.567925,0.515032,0.484968,0.406750,0.385849,0.007038,0.004818,0.004180,0.003731,0.004322,0.005361,0.000614,0.000685,0.008266,0.008455,0.083863,0.076069,0.792599,0.011856,0.007912,0.009683,0.001299,0.016721,0.159932,0.393477
3141,56,Wyoming,Teton,56039,23464,12142.0,11322.0,9832.0,9168.0,87.0,58.0,33.0,41.0,135.0,243.0,11.0,7.0,160.0,135.0,1884.0,1670.0,19000,145,74,378,18,295,3554,7.942493,7.886051,8.002435,8.187366,8.316012,7.614094,7.990196,7.960452,8.310127,6.431694,8.026432,7.038462,7.689655,6.022857,5.368715,6.708533,6.691400,8.249993,7.766932,8.125373,7.314634,7.381818,5.692090,6.700384,West,Mountain,10351.784301,-110.589071,43.935211,"[56013, 56023, 56029, 56035]",7313.0,3920.0,12176.0,0.348972,83831.0,,,98125.0,,36433.0,17188.0,45361.0,95222.0,14776,7716,7060,14565,...,5.281161,0.008499,4.470588,0.235294,2.000000,0.000000,5.000000,0.285714,2.000000,0.000000,2.000000,0.000000,2.000000,0.000000,4.839286,0.232143,7.000000,0.000000,3.441176,0.382353,0.0,1.0,0.0,1.0,,,2.201691,0.264493,1.311258,0.344371,2.712928,0.218631,6.903846,0.000000,6.903846,0.000000,,,1.986849,0.334145,1.184049,0.407975,2.717209,0.266977,0.095,0.157,0.160,0.247,0.340,2.578,2.266662,0.517474,0.482526,0.419025,0.390726,0.003708,0.002472,0.001406,0.001747,0.005753,0.010356,0.000469,0.000298,0.006819,0.005753,0.080293,0.071173,0.809751,0.006180,0.003154,0.016110,0.000767,0.012572,0.151466,0.518923
3142,56,Wyoming,Uinta,56041,20226,10224.0,10002.0,8935.0,8722.0,64.0,62.0,69.0,75.0,36.0,56.0,11.0,13.0,182.0,130.0,927.0,944.0,17657,126,144,92,24,312,1871,8.657688,8.632103,8.685126,9.090419,9.172339,9.160920,8.896552,9.424242,8.878049,7.711111,7.452675,10.090909,9.000000,6.056250,6.362963,6.478238,6.364072,9.129947,9.055172,9.121622,7.544974,9.666667,6.196610,6.424592,West,Mountain,5391.631764,-110.547578,41.287818,"[56023, 56037]",1202.0,6154.0,8053.0,0.836596,58235.0,,68125.0,,,55701.0,39205.0,39816.0,61330.0,11678,5807,5871,10942,...,2.738034,0.041901,4.700000,0.150000,7.000000,0.000000,1.250000,0.375000,2.435644,0.079208,2.800000,0.145455,2.000000,0.000000,7.000000,0.000000,,,7.000000,0.000000,,,,,,,1.451777,0.274112,1.687500,0.156250,1.014493,0.492754,2.120000,0.040000,2.377778,0.005556,1.847059,0.076471,1.793727,0.214717,1.818605,0.160465,1.766917,0.273183,0.098,0.278,0.154,0.207,0.264,2.263,3.751369,0.505488,0.494512,0.441758,0.431227,0.003164,0.003065,0.003411,0.003708,0.001780,0.002769,0.000544,0.000643,0.008998,0.006427,0.045832,0.046673,0.872985,0.006230,0.007120,0.004549,0.001187,0.015426,0.092505,0.398151
3143,56,Wyoming,Washakie,56043,7805,3963.0,3842.0,3266.0,3151.0,23.0,15.0,19.0,34.0,22.0,33.0,0.0,0.0,68.0,66.0,565.0,543.0,6417,38,53,55,0,134,1108,8.025512,7.988165,8.063687,8.168103,8.313575,7.375000,6.016129,8.014493,8.853333,7.000000,8.607143,4.545455,7.076923,7.192308,5.892308,6.529666,6.106992,8.239961,6.706349,8.451389,7.978261,5.916667,6.650641,6.316408,West,Mountain,5798.138762,-107.680187,43.904516,"[56003, 56013, 56017, 56019, 56025, 56029]",532.0,2911.0,3715.0,0.845484,53426.0,,,,,62054.0,36118.0,50035.0,54815.0,4848,2425,2423,4418,...,2.938093,0.041684,,,,,,,5.857143,0.000000,5.857143,0.000000,,,,,,,,,,,,,,,0.767442,0.616279,0.639175,0.680412,0.933333,0.533333,2.138095,0.038095,1.777778,0.111111,2.326087,0.000000,1.985759,0.240506,1.716049,0.327160,2.269481,0.149351,0.204,0.155,0.069,0.285,0.287,2.296,1.346122,0.507751,0.492249,0.418450,0.403716,0.002947,0.001922,0.002434,0.004356,0.002819,0.004228,0.000000,0.000000,0.008712,0.008456,0.072389,0.069571,0.822165,0.004869,0.006791,0.007047,0.000000,0.017168,0.141960,0.475977


In [64]:
# nans only appear in votes, so we can just impute with 0.5 until we find better data
# info_df = info_df.fillna(0.5)

In [65]:
# don't want to double count nyc boroughs
for f in nyc_fips:
    info_df = info_df[info_df['fips'] != f]

# save results

In [66]:
with open('../data/processed/info_df.p', 'wb') as file:
    pickle.dump(info_df, file, protocol=pickle.HIGHEST_PROTOCOL)
info_df.to_csv('../data/processed/info_df.csv', index=False)

# Future Work: import Puerto Rico census data

To do:
- find detailed demographic data for Puerto Rico
- find a way to incorporate Puerto Rico into the Altair map

In [None]:
# with urlopen('https://www2.census.gov/programs-surveys/popest/tables/2010-2019/municipios/totals/prm-est2019-annres.xlsx') as response:
#     pr_df = pd.read_excel(response, header=3)
pr_df = pd.read_excel('data/prm-est2019-annres.xlsx', header=3)
pr_df = pr_df[['Unnamed: 0', 2019]]
pr_df.rename(
    columns={
        'Unnamed: 0':'county',
        2019:'total_pop'
    }, inplace=True
)
pr_df = pr_df[~pr_df['total_pop'].isna()]
pr_df['total_pop'] = pr_df['total_pop'].astype('int')
pr_df.head()

In [None]:
pr_df['county'] = [s[0] if len(s) > 0 else s for s in pr_df['county'].str.findall("\.([\w\s]+) Municipio\,.+")]
pr_df = pr_df.iloc[1:]          # removing the territory as a whole from the table
pr_df.head()

We also need to add `fips` codes for all of the municipios.

### import Puerto Rico `fips`

In [None]:
sess = HTMLSession()
res = sess.get('https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county')
table = res.html.find('table.wikitable > tbody > tr')
# puerto rico is fips 72
pr_fips = [[tr.find('td')[1].text, tr.find('td')[0].text] for tr in table[1:] if tr.find('td')[0].text[:2] == '72']
pr_fips_df = pd.DataFrame(pr_fips)
pr_fips_df.rename(
    columns={
        0:'county',
        1:'fips'
    }, inplace=True
)
pr_fips_df.head()

In [None]:
pr_fips_df['county'] = [s[0] if len(s) > 0 else s for s in pr_fips_df['county'].str.findall("([\w\s]+) Municipality")]
pr_fips_df.head()

In [None]:
len(list(set(pr_fips_df['county']) - set(pr_df['county'])))

In [None]:
pr_df = pr_df.merge(pr_fips_df, on='county')
pr_df['state'] = 'Puerto Rico'
pr_df.head()

In [None]:
pop_df = optimize(pop_df.append(pr_df, ignore_index=True).append(pr_df, ignore_index=True))
pop_df.tail()

## check county names against NYTimes data

We eventually need to merge with the NYTimes data, so let's see how they match with each other:

In [None]:
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv') as response:
    nyt_df = pd.read_csv(
        response,
        dtype={'fips':'str'}
    )
nyt_df.head()

In [None]:
county_diffs = list(set(nyt_df['fips']) - set(pop_df['fips']))
len(county_diffs)

In [None]:
sorted([str(f) for f in county_diffs])

As expected, the census county data is missing all municipios from [Puerto Rico](https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html) (`fips == 72`) as well as a couple from the Northern Mariana Islands (`fips == 69`) and US Virgin Islands (`fips == 78`) so we need to append that data to `pop_df`.

In [None]:
county_diffs = list(set(nyt_df['county']) - set(dem_df['county']))
len(county_diffs)

In [None]:
county_diffs

In [None]:


# def custom_county_maker(df, using='fips', method='sum', age=False, 
#     state_fips='36', state='New York', state_abbr='NY', county='New York City', 
#     fips=nyc_fips, counties=boroughs, end_fips='36NYC'):
    
#     cols = df.select_dtypes(include='number').columns
#     if 'agegrp' in cols:
#         cols.remove('agegrp')
    
#     loc_dict = {'fips': fips, 'counties': counties}
    
#     to_index = using
#     if age:
#         to_index += ['agegrp']
        
#     def _sub_df(df):
#         return df.loc[df['state']]
        
#     temp_df = source_df.loc[source_df['state']==state]\
#                        .set_index(to_index).loc[loc_dict[using][0], :]
#     ref_df = pop_df.loc[source_df['state']=='New York']\
#                  .set_index('county').loc[loc_dict[using][0], 'tot_pop']
        
#     temp_df = temp_df.select_dtypes(include='number')
    
#     if method == 'sum':
#         temp_df = pd.DataFrame(
#             [np.sum(temp_df)],
#             columns=cols
#         )
#     elif method == 'mean':
#         temp_df = pd.DataFrame(
#             [np.average(temp_df.values, axis=0, weights=ref_df)],
#             columns=cols
#         )
        
#     for c in source_df.select_dtypes(exclude='number').columns:
#         if 'state_fips' in c.lower():
#             temp_df[c] = state_fips
#         elif source_df[c].map(len).mean() == 2:
#             temp_df[c] = state_abbr
#         elif 'county' in c.lower():
#             temp_df[c] = county
#         elif 'state' in c.lower():
#             temp_df[c] = state
#         elif 'fips' in c.lower():
#             temp_df[c] = end_fips
            
#     return temp_df

The NYTimes dataset is missing diacritical marks in their names. While it would be easier to replace diacritical marks with their "standard" character counterparts, we will preserve them in our final dataframe in the interest of cultural accuracy. This will be handled when we merge `pop_df` with `nyt_df` in the other notebook.