In [18]:
import pandas as pd
import re

pd.set_option('display.max_columns', None)

In [19]:
WORKING_DIRECTORY = '../data/raw/census/'

## Incident counts, weighted by population

We'll start by downloading the total 2020 population counts for US incorporated places and minor encorporated places: April 1 2020 to July 1 2021 from [the Census.gov website](https://www.census.gov/data/tables/time-series/demo/popest/2020s-total-cities-and-towns.html). [Direct link.](https://www2.census.gov/programs-surveys/popest/datasets/2020-2021/cities/totals/sub-est2021_all.csv)

In [20]:
df = pd.read_csv(WORKING_DIRECTORY + 'sub-est2021_all.csv',
                         sep=',',
                         encoding='ISO-8859-1')
df.sample(n=1, random_state=1)

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,ESTIMATESBASE2020,POPESTIMATE2020,POPESTIMATE2021
9633,157,17,19,66950,0,0,0,A,St. Joseph village,Illinois,3793,3774,3725


In [21]:
df['NAME'] = df.NAME \
                .str \
                .upper()

The state data is spelled out, so we'll split and convert to state codes.

In [22]:
state_dict = {
    "ALABAMA": "AL",
    "ALASKA": "AK",
    "ARIZONA": "AZ",
    "ARKANSAS": "AR",
    "CALIFORNIA": "CA",
    "COLORADO": "CO",
    "CONNECTICUT": "CT",
    "DELAWARE": "DE",
    "FLORIDA": "FL",
    "GEORGIA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "ILLINOIS": "IL",
    "INDIANA": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KENTUCKY": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MICHIGAN": "MI",
    "MINNESOTA": "MN",
    "MISSISSIPPI": "MS",
    "MISSOURI": "MO",
    "MONTANA": "MT",
    "NEBRASKA": "NE",
    "NEVADA": "NV",
    "NEW HAMPSHIRE": "NH",
    "NEW JERSEY": "NJ",
    "NEW MEXICO": "NM",
    "NEW YORK": "NY",
    "NORTH CAROLINA": "NC",
    "NORTH DAKOTA": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OREGON": "OR",
    "PENNSYLVANIA": "PA",
    "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD",
    "TENNESSEE": "TN",
    "TEXAS": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WEST VIRGINIA": "WV",
    "WISCONSIN": "WI",
    "WYOMING": "WY"
}


In [23]:
df['STNAME'] = df \
    .copy(deep=True) \
    .STNAME \
    .str \
    .upper() \
    .map(state_dict)

Calculate null percentage per column.

In [24]:
100 * df.isnull().sum() / df.shape[0]

SUMLEV               0.000000
STATE                0.000000
COUNTY               0.000000
PLACE                0.000000
COUSUB               0.000000
CONCIT               0.000000
PRIMGEO_FLAG         0.000000
FUNCSTAT             0.000000
NAME                 0.000000
STNAME               0.004913
ESTIMATESBASE2020    0.000000
POPESTIMATE2020      0.000000
POPESTIMATE2021      0.000000
dtype: float64

We'll drop the small percentage of nulls for state.

In [25]:
df = df[df.STNAME.notna()]

Filter out the states themselves, which are included in the dataset.

In [26]:
state_names = state_dict.keys()
state_codes = state_dict.values()

name_mask = df['NAME'].isin(state_names)
code_mask = df['STNAME'].isin(state_codes)

df = df[~name_mask | ~code_mask]

In [27]:
df[['NAME', 'PLACE_TYPE']] = df['NAME'].str.rsplit(n=1, expand=True)

In [28]:
df.groupby(by='PLACE_TYPE').size().sort_values(ascending=False)

PLACE_TYPE
CITY            25262
TOWNSHIP        16158
TOWN            13141
VILLAGE         10153
COUNTY           5925
(PT.)            5258
BOROUGH          4878
UT                239
PARISH            122
PLANTATION         32
(BALANCE)          25
AREA               21
RESERVATION        18
MUNICIPALITY       16
GOVERNMENT         15
GRANT               9
1                   8
2                   7
7                   7
5                   7
6                   7
4                   6
PURCHASE            6
3                   5
8                   5
9                   5
GORE                4
LOCATION            4
11                  4
10                  4
CORPORATION         2
12                  2
CDP                 2
BOW                 1
dtype: int64

In [29]:
df = df[
    (df.PLACE_TYPE != '(PT.)') &
    (df.PLACE_TYPE != '(BALANCE)') &
    (df.PLACE_TYPE != 'COUNTY')
]

For compatibility with our other datasets and for efficient joining, create a column that combines city and state.

In [30]:
df['CITYSTATE'] = df['NAME'].str.upper() + ',' + df['STNAME'].str.upper()
df.sample(random_state=2)

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,ESTIMATESBASE2020,POPESTIMATE2020,POPESTIMATE2021,PLACE_TYPE,CITYSTATE
79024,61,55,51,0,73400,0,1,A,SHERMAN,WI,287,287,290,TOWN,"SHERMAN,WI"


Remove any cities with a population of zero or less.

In [31]:
df = df[df.ESTIMATESBASE2020 > 0]

Rename the population column.

In [32]:
df = df \
    .rename(columns={'ESTIMATESBASE2020': 'POPULATION'})

The other datasets say NEW YORK CITY instead of NEW YORK, so we'll manually change the value.

Export cleaned data to csv.

In [33]:
OUT_DIRECTORY = '../data/processed/'
df.to_csv(
    OUT_DIRECTORY + 'populations_clean.csv', 
    sep=',', 
    index=False
)