# Census Data EDA

In [1]:
import numpy as np
import pandas as pd

from urllib.request import urlopen
from requests_html import HTMLSession
import json

import itertools

import re

from time import time
from datetime import datetime, timedelta

from shapely.geometry import Polygon

# import census data

Population data taken from [census.gov](https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/).

Since 2020 Census data have not been released yet, we will use 2019 population estimates.

Looking at the [data dictionary](https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf), we only want the names and FIPS columns (eg. `STATE`, `STNAME`) and `POPESTIMATE2019`.

In [7]:
with urlopen('https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv') as response:
    pop_df = pd.read_csv(
        response, 
        usecols=['STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'POPESTIMATE2019'], 
        encoding='latin-1',        # to avoid unicode error
        dtype={'STATE':'str',
               'COUNTY':'str'}
    )
pop_df.head()

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,POPESTIMATE2019
0,1,0,Alabama,Alabama,4903185
1,1,1,Alabama,Autauga County,55869
2,1,3,Alabama,Baldwin County,223234
3,1,5,Alabama,Barbour County,24686
4,1,7,Alabama,Bibb County,22394


In [3]:
pop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3193 entries, 0 to 3192
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   STATE            3193 non-null   object
 1   COUNTY           3193 non-null   object
 2   STNAME           3193 non-null   object
 3   CTYNAME          3193 non-null   object
 4   POPESTIMATE2019  3193 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 124.9+ KB


Notice that county names provided by the US census contain descriptive terms, such as 'County', whereas the NYTimes data does not.

In [5]:
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.
    '''
    dft = df.copy()
    # converts to datetime if possible
    dft = dft.apply(lambda col: pd.to_datetime(col, errors='ignore') if col.dtypes=='object' else col)
    # if there are less than half as many unique values as there are rows, convert to category
    for col in dft.select_dtypes(include='object'):
        if len(dft[col].unique()) / len(df[col]) < 0.5:
            dft[col] = dft[col].astype('category')
    # downcasts numeric columns if possible
    dft = dft.apply(lambda col: pd.to_numeric(col, downcast='integer') if col.dtypes=='int64' else col)
    dft = dft.apply(lambda col: pd.to_numeric(col, downcast='float') if col.dtypes=='float64' else col)
    return dft

In [8]:
# remove state population data
pop_df = pop_df[pop_df['COUNTY'] != '000']

# rename columns to better-match nytimes data (and personal preference)
pop_df.rename(
    columns={
        'STATE':'statefips',
        'COUNTY':'countyfips',
        'STNAME':'state',
        'CTYNAME':'county',
        'POPESTIMATE2019':'population'
    }, inplace=True
)

# create county fips column
pop_df['fips'] = pop_df['statefips'] + pop_df['countyfips']
pop_df.drop(columns=['statefips', 'countyfips'], inplace=True)

# remove descriptive terms from county names
county_terms = ['County', 'Parish', 'Municipality']
for term in county_terms:
    pop_df['county'] = pop_df['county'].str.replace(' ' + term, '')
    
pop_df = optimize(pop_df)
pop_df.head()

Unnamed: 0,state,county,population,fips
1,Alabama,Autauga,55869,1001
2,Alabama,Baldwin,223234,1003
3,Alabama,Barbour,24686,1005
4,Alabama,Bibb,22394,1007
5,Alabama,Blount,57826,1009


In [9]:
pop_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 1 to 3192
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   state       3142 non-null   category
 1   county      3142 non-null   object  
 2   population  3142 non-null   int32   
 3   fips        3142 non-null   object  
dtypes: category(1), int32(1), object(2)
memory usage: 91.9+ KB


# check county names against NYTimes data

We eventually need to merge `nyt_df` and `pop_df`, so let's see how they match with each other:

In [10]:
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv') as response:
    nyt_df = optimize(pd.read_csv(
        response,
        dtype={'fips':'str'}
    ))
nyt_df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0


In [12]:
county_diffs = list(set(nyt_df['county']) - set(pop_df['county']))
len(county_diffs)

80

In [13]:
county_diffs[:5]

['Aguas Buenas', 'Guanica', 'San German', 'Arecibo', 'San Sebastian']

The census county data is missing all municipios from [Puerto Rico](https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html), so we need to append that data to `pop_df`.

# import Puerto Rico census data

In [14]:
pr_df = pd.read_excel(
    'https://www2.census.gov/programs-surveys/popest/tables/2010-2019/municipios/totals/prm-est2019-annres.xlsx', 
    header=3)
pr_df = pr_df[['Unnamed: 0', 2019]]
pr_df.rename(
    columns={
        'Unnamed: 0':'county',
        2019:'population'
    }, inplace=True
)
pr_df = pr_df[~pr_df['population'].isna()]
pr_df['population'] = pr_df['population'].astype('int')
pr_df.head()

Unnamed: 0,county,population
0,Puerto Rico,3193694
1,".Adjuntas Municipio, Puerto Rico",17363
2,".Aguada Municipio, Puerto Rico",36694
3,".Aguadilla Municipio, Puerto Rico",50265
4,".Aguas Buenas Municipio, Puerto Rico",24814


In [15]:
pr_df['county'] = [s[0] if len(s) > 0 else s for s in pr_df['county'].str.findall("\.([\w\s]+) Municipio\,.+")]
pr_df = pr_df.iloc[1:]          # removing the territory as a whole from the table
pr_df.head()

Unnamed: 0,county,population
1,Adjuntas,17363
2,Aguada,36694
3,Aguadilla,50265
4,Aguas Buenas,24814
5,Aibonito,22108


We also need to add `fips` codes for all of the municipios.

## import Puerto Rico `fips`

In [16]:
sess = HTMLSession()
res = sess.get('https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county')
table = res.html.find('table.wikitable > tbody > tr')
# puerto rico is fips 72
pr_fips = [[tr.find('td')[1].text, tr.find('td')[0].text] for tr in table[1:] if tr.find('td')[0].text[:2] == '72']
pr_fips_df = pd.DataFrame(pr_fips)
pr_fips_df.rename(
    columns={
        0:'county',
        1:'fips'
    }, inplace=True
)
pr_fips_df.head()

Unnamed: 0,county,fips
0,Adjuntas Municipality,72001
1,Aguada Municipality,72003
2,Aguadilla Municipality,72005
3,Aguas Buenas Municipality,72007
4,Aibonito Municipality,72009


In [17]:
pr_fips_df['county'] = [s[0] if len(s) > 0 else s for s in pr_fips_df['county'].str.findall("([\w\s]+) Municipality")]
pr_fips_df.head()

Unnamed: 0,county,fips
0,Adjuntas,72001
1,Aguada,72003
2,Aguadilla,72005
3,Aguas Buenas,72007
4,Aibonito,72009


In [19]:
len(list(set(pr_fips_df['county']) - set(pr_df['county'])))

0

In [20]:
pr_df = pr_df.merge(pr_fips_df, on='county')
pr_df['state'] = 'Puerto Rico'
pr_df.head()

Unnamed: 0,county,population,fips,state
0,Adjuntas,17363,72001,Puerto Rico
1,Aguada,36694,72003,Puerto Rico
2,Aguadilla,50265,72005,Puerto Rico
3,Aguas Buenas,24814,72007,Puerto Rico
4,Aibonito,22108,72009,Puerto Rico


In [21]:
pop_df = pop_df.append(pr_df, ignore_index=True)
pop_df.tail()

Unnamed: 0,state,county,population,fips
3215,Puerto Rico,Vega Baja,50023,72145
3216,Puerto Rico,Vieques,8386,72147
3217,Puerto Rico,Villalba,21372,72149
3218,Puerto Rico,Yabucoa,32282,72151
3219,Puerto Rico,Yauco,33575,72153


In [22]:
pop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3220 entries, 0 to 3219
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   state       3220 non-null   object
 1   county      3220 non-null   object
 2   population  3220 non-null   int32 
 3   fips        3220 non-null   object
dtypes: int32(1), object(3)
memory usage: 88.2+ KB


# check county names against NYTimes data (again)

In [24]:
county_diffs = list(set(nyt_df['county']) - set(pop_df['county']))
len(county_diffs)

19

In [27]:
county_diffs

['Guanica',
 'San German',
 'San Sebastian',
 'Rincon',
 'Unknown',
 'New York City',
 'Juana Diaz',
 'Loiza',
 'Mayaguez',
 'Catano',
 'Kansas City',
 'Manati',
 'Anasco',
 'Bayamon',
 'Canovanas',
 'Penuelas',
 'Comerio',
 'Joplin',
 'Las Marias']

The NYTimes dataset is missing diacritical marks in their names. In the interest of cultural accuracy, we will preserve them in our final dataframe; this will be handled when we merge `pop_df` with `nyt_df` in the other notebook.

Since the NYTimes dataset treats `New York City`, `Kansas City`, and `Joplin` as their own entities, we need to add them to `pop_df`. We need to add the population data for these three cities. Additional information taken from [census.gov quickfacts]('https://www.census.gov/quickfacts').

We'll use `'nyc'`, `'kc'`, and `'jm'` as our `fips` for these three cities.

In [28]:
pop_df_2 = pd.DataFrame(
    [['New York',
      'New York City',
      8_336_817,
      'nyc'],
     ['Missouri',
      'Kansas City',
      495_327 + 152_960,
      'kc'],
     ['Missouri',
      'Joplin',
      50_925,
      'jm']]
    , columns=pop_df.columns)
pop_df_2

Unnamed: 0,state,county,population,fips
0,New York,New York City,8336817,nyc
1,Missouri,Kansas City,648287,kc
2,Missouri,Joplin,50925,jm


In [29]:
pop_df = optimize(pop_df.append(pop_df_2, ignore_index=True))
pop_df[pop_df['fips'] == 'nyc']

Unnamed: 0,state,county,population,fips
3220,New York,New York City,8336817,nyc


In [30]:
pop_df.to_csv('data/pop_df.csv', index=False)

# put in nytimes notebook

In [26]:
nyt_df = nyt_df.merge(pop_df, on='fips', suffixes=('_x','')).drop(['county_x', 'state_x'], axis=1)
nyt_df[nyt_df['state'] == 'Puerto Rico']

Unnamed: 0,date,fips,cases,deaths,state,county,population
314598,2020-05-05,72033,12,0,Puerto Rico,Cataño,23121
314599,2020-05-06,72033,12,0,Puerto Rico,Cataño,23121
314600,2020-05-07,72033,12,0,Puerto Rico,Cataño,23121
314601,2020-05-08,72033,12,0,Puerto Rico,Cataño,23121
314602,2020-05-09,72033,12,0,Puerto Rico,Cataño,23121


In [None]:
nyt_df = nyt_df[nyt_df['county'] != 'Unknown']
list(set(nyt_df['county']) - set(pop_df['county']))

We need to add the population data for these three cities. Additional information taken from [census.gov quickfacts]('https://www.census.gov/quickfacts').

We'll use `'nyc'`, `'kc'`, and `'jm'` as our `fips` for these three cities.