In [6]:
# read in list of charter cities (one entry per line of the text file)
charter_cities_path = 'data/cacities/charter-cities.txt'

with open(charter_cities_path) as f:
    # Strip surrounding whitespace from every line and drop lines that end up
    # empty (e.g. a trailing blank line), so '' never becomes a set member.
    charter_cities = {entry for entry in map(str.strip, f) if entry}

In [68]:
import re

def parse_geoname(geoname):
    """Split a place label into its state, name and geotype parts.

    Shapes that are handled:

    * ``"California"`` -> state only
    * ``"Princeton, New Jersey"`` -> name and state, empty geotype
    * ``"Los Angeles city, California"`` -> name, geotype and state
    * ``"Bayview CDP (Contra Costa County), California"`` -> the trailing
      parenthetical is re-attached to the name, giving
      ``"Bayview (Contra Costa County)"`` with geotype ``"CDP"``
    * ``"Princeton (Mercer County), New Jersey"`` -> name keeps the
      parenthetical, geotype stays empty

    Parameters
    ----------
    geoname : str
        The label to parse.

    Returns
    -------
    dict
        Keys ``state``, ``name`` and ``geotype``; any part that is not
        present in ``geoname`` is the empty string.
    """
    r = dict(state='', name='', geotype='')

    # e.g. state data
    if ', ' not in geoname:
        r['state'] = geoname
        return r

    # everything after the last ", " is the state
    rest, r['state'] = geoname.rsplit(', ', 1)

    # missing geotype (e.g. "Princeton, New Jersey")
    if ' ' not in rest:
        r['name'] = rest
        return r

    clarification = ''
    # e.g. "Bayview CDP (Contra Costa County), California"
    # Test for ' (' (the exact separator used by the split) rather than a bare
    # '(' so the two-way unpacking below can never fail.
    if rest.endswith(')') and ' (' in rest:
        rest, clarification = rest[:-1].rsplit(' (', 1)

    if ' ' in rest:
        # the last word is the geotype, everything before it is the name
        r['name'], r['geotype'] = rest.rsplit(' ', 1)
    else:
        # Only a single word is left once the parenthetical is removed
        # (e.g. "Princeton (Mercer County)"): there is no geotype.
        r['name'] = rest

    if clarification:
        r['name'] = f"{r['name']} ({clarification})"

    return r

In [74]:
def filter_and_process_place_rows(rows, charter_cities=()):
    """Yield the California rows of ``rows`` with parsed geoname fields added.

    Each row is a dict with at least the keys ``lnnumber``, ``lntitle`` and
    ``geoname``. Rows are modified in place: for every row whose ``lnnumber``
    is ``'1'`` the ``lnnumber`` and ``lntitle`` keys are deleted, and rows
    that reach the parsing step are updated with the ``state``, ``name`` and
    ``geotype`` keys returned by ``parse_geoname``. The yielded objects are
    the same dict objects that came in.

    ``charter_cities`` is accepted but not used by this function.
    """
    for record in rows:
        # anything other than line '1' is a race/ethnicity breakdown
        if record['lnnumber'] != '1':
            continue

        # identical on every remaining row, so carry neither column forward
        for redundant_key in ('lnnumber', 'lntitle'):
            del record[redundant_key]

        # cheap substring pre-filter before doing the actual parse
        if 'California' not in record['geoname']:
            continue

        parsed_fields = parse_geoname(record['geoname'])
        record.update(parsed_fields)

        # the substring test can match outside the state part, so confirm it
        if record['state'] == 'California':
            yield record

In [75]:
# keep CA data only, exclude breakdown by race/ethnicity
import csv

cvap_places_path = 'data/census/CVAP_2015-2019_ACS_csv_files/Place.csv'

# newline='' is what the csv module asks for when it is handed a file object.
# filter_and_process_place_rows is a generator and reads lazily, so the rows
# are materialized with list() while the file is still open; the `with` block
# then closes the handle instead of leaving it open for the kernel's lifetime.
with open(cvap_places_path, newline='', encoding='latin-1') as f:
    reader = csv.DictReader(f)
    ca_places_list = list(filter_and_process_place_rows(reader))

In [76]:
import pandas as pd

# one DataFrame row per dict in ca_places_list, columns taken from the dict keys
data = pd.DataFrame(ca_places_list)

In [80]:
# Show the resulting column labels: the CSV fields that were kept plus the
# state / name / geotype keys added from parse_geoname.
data.columns

Index(['geoname', 'geoid', 'tot_est', 'tot_moe', 'adu_est', 'adu_moe',
       'cit_est', 'cit_moe', 'cvap_est', 'cvap_moe', 'state', 'name',
       'geotype'],
      dtype='object')