In [181]:
import pandas as pd

pd.set_option('display.max_columns', None)

In [182]:
WORKING_DIRECTORY = '../data/raw/census/'

## Incident counts, weighted by population

We'll start by downloading the total 2020 population counts for US incorporated places and minor encorporated places: April 1 2020 to July 1 2021 from [the Census.gov website](https://www.census.gov/data/tables/time-series/demo/popest/2020s-total-cities-and-towns.html). [Direct link.](https://www2.census.gov/programs-surveys/popest/datasets/2020-2021/cities/totals/sub-est2021_all.csv)

In [183]:
df = pd.read_csv(WORKING_DIRECTORY + 'sub-est2021_all.csv',
                         sep=',',
                         encoding='ISO-8859-1')
df.sample()

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,ESTIMATESBASE2020,POPESTIMATE2020,POPESTIMATE2021
73297,157,48,217,3588,0,0,1,A,Aquilla city,Texas,103,103,104


In [184]:
df = df.loc[:, ['PLACE', 'NAME', 'STNAME', 'ESTIMATESBASE2020']]

In [185]:
df['NAME'] = df.NAME \
                .str \
                .upper()

The state data is spelled out, so we'll split and convert to state codes.

In [186]:
state_dict = {
    "ALABAMA": "AL",
    "ALASKA": "AK",
    "ARIZONA": "AZ",
    "ARKANSAS": "AR",
    "CALIFORNIA": "CA",
    "COLORADO": "CO",
    "CONNECTICUT": "CT",
    "DELAWARE": "DE",
    "FLORIDA": "FL",
    "GEORGIA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "ILLINOIS": "IL",
    "INDIANA": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KENTUCKY": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MICHIGAN": "MI",
    "MINNESOTA": "MN",
    "MISSISSIPPI": "MS",
    "MISSOURI": "MO",
    "MONTANA": "MT",
    "NEBRASKA": "NE",
    "NEVADA": "NV",
    "NEW HAMPSHIRE": "NH",
    "NEW JERSEY": "NJ",
    "NEW MEXICO": "NM",
    "NEW YORK": "NY",
    "NORTH CAROLINA": "NC",
    "NORTH DAKOTA": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OREGON": "OR",
    "PENNSYLVANIA": "PA",
    "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD",
    "TENNESSEE": "TN",
    "TEXAS": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WEST VIRGINIA": "WV",
    "WISCONSIN": "WI",
    "WYOMING": "WY"
}


In [187]:
df['STNAME'] = df.STNAME \
                .str \
                .upper() \
                .map(state_dict)

Calculate null percentage per column.

In [188]:
100 * df.isnull().sum() / df.shape[0]

PLACE                0.000000
NAME                 0.000000
STNAME               0.004913
ESTIMATESBASE2020    0.000000
dtype: float64

We'll drop the small percentage of nulls for state.

In [189]:
df = df[df.STNAME.notna()]

In [190]:
state_names = state_dict.keys()
state_codes = state_dict.values()

name_mask = df['NAME'].isin(state_names)
code_mask = df['STNAME'].isin(state_codes)

df = df[~name_mask | ~code_mask]

In [191]:
df = df[~df['NAME'].str.endswith('COUNTY')]

We'll drop the partial municipalities for this analysis.

In [192]:
df = df[~df['NAME'].str.endswith('(PT.)')]

In [193]:
mask = (df.NAME.str.contains('CITY CITY')) & (df.ESTIMATESBASE2020 > 10000)
df[mask] \
    .loc[:, ['NAME', 'STNAME']] \
    .sample(n=3, random_state=1)

Unnamed: 0,NAME,STNAME
3753,NATIONAL CITY CITY,CA
8422,GRANITE CITY CITY,IL
39670,UNIVERSITY CITY CITY,MO


In [194]:
print(
    'CITY CITY:', len(df[df.NAME.str.contains('CITY CITY')]),
    '\nVILLAGE VILLAGE:', len(df[df.NAME.str.contains('VILLAGE VILLAGE')]),
    '\nTOWN TOWN:', len(df[df.NAME.str.contains('TOWN TOWN')]),
    '\nBOROUGH BOROUGH:', len(df[df.NAME.str.contains('BOROUGH BOROUGH')]),
    '\nTOWNSHIP TOWNSHIP:', len(df[df.NAME.str.contains('TOWNSHIP TOWNSHIP')])
)

CITY CITY: 694 
VILLAGE VILLAGE: 19 
TOWN TOWN: 342 
BOROUGH BOROUGH: 0 
TOWNSHIP TOWNSHIP: 0


These duplicate names are different than our other datasets, so we'll remove them.

In [195]:
replace_name = lambda x: x \
                        .replace('CITY CITY', 'CITY') \
                        .replace('TOWN TOWN', 'TOWN') \
                        .replace('VILLAGE VILLAGE', 'VILLAGE')
df['NAME'] = df['NAME'].apply(replace_name)

Verify we removed these names correctly.

In [196]:
print(
    'CITY CITY:', len(df[df.NAME.str.contains('CITY CITY')]),
    '\nVILLAGE VILLAGE:', len(df[df.NAME.str.contains('VILLAGE VILLAGE')]),
    '\nTOWN TOWN:', len(df[df.NAME.str.contains('TOWN TOWN')]),
    '\nBOROUGH BOROUGH:', len(df[df.NAME.str.contains('BOROUGH BOROUGH')]),
    '\nTOWNSHIP TOWNSHIP:', len(df[df.NAME.str.contains('TOWNSHIP TOWNSHIP')])
)

CITY CITY: 0 
VILLAGE VILLAGE: 0 
TOWN TOWN: 0 
BOROUGH BOROUGH: 0 
TOWNSHIP TOWNSHIP: 0


For compatibility with our other datasets and for efficient joining, create a column that combines city and state.

In [197]:
df['CITYSTATE'] = df['NAME'].str.upper() + ',' + df['STNAME'].str.upper()
df.sample(random_state=2)

Unnamed: 0,PLACE,NAME,STNAME,ESTIMATESBASE2020,CITYSTATE
78719,0,BRANDON VILLAGE,WI,885,"BRANDON VILLAGE,WI"


Remove any cities with a population of zero or less.

In [198]:
df = df[df.ESTIMATESBASE2020 > 0]

Remove duplicate rows.

In [199]:
df = df[~df.duplicated()]

Rename the population column for clarity and drop unnecessary columns before exporting.

In [200]:
df = df \
    .rename(columns={'ESTIMATESBASE2020': 'POPULATION'}) \
    .drop(labels=['NAME', 'STNAME', 'PLACE'], axis=1)

Export cleaned data to csv.

In [180]:
OUT_DIRECTORY = '../data/processed/'
df.to_csv(
    OUT_DIRECTORY + 'populations_clean.csv', 
    sep=',', 
    index=False
)