# US Census Data



In [1]:
# Dependencies
import pandas as pd
import requests
from pathlib import Path
from census import Census

# Be sure config.py is in the same folder and defines census_key with a valid Census API key
from config import census_key

In [2]:
# Define function to retrieve US census data for a specific year and return a dataframe
def get_census_data(year, state_code='06', county_code='075'):
    """Retrieve ACS 5-year population and bicycle-commuter figures for one county.

    Parameters
    ----------
    year : int
        Survey year handed to the Census client; also stored in the 'year' column.
    state_code : str, optional
        State FIPS code. Defaults to '06' (California).
    county_code : str, optional
        County FIPS code within the state. Defaults to '075'
        (San Francisco County).

    Returns
    -------
    pandas.DataFrame
        The rows returned for the requested variables (the API also adds
        'state' and 'county' columns), plus a 'year' column set to `year`.
    """
    census_library = Census(census_key, year=year)
    variables = [
        'NAME',
        'B01003_001E',   # Population
        'B08006_014E',   # Bicycle commuters
    ]

    # Retrieve data for the requested county only (one state/county pair)
    data = census_library.acs5.state_county(variables, state_code, county_code)

    # Convert to dataframe and tag every row with the survey year so that
    # results for several years can be concatenated and still told apart
    df = pd.DataFrame(data)
    df['year'] = year

    # Return the dataframe
    return df

In [3]:
# Retrieve census data for multiple years and merge into one dataframe
first_year = 2012
last_year = 2022

# Collect one dataframe per year, in chronological order
year_df = []
for year in range(first_year, last_year + 1):
    print(f'Getting data for {year}')
    year_df.append(get_census_data(year))

# Stack the yearly dataframes row-wise (each keeps its own original index)
# Code reference: https://www.geeksforgeeks.org/merge-two-dataframes-with-same-column-names/
census_df = pd.concat(year_df)

Getting data for 2012
Getting data for 2013
Getting data for 2014
Getting data for 2015
Getting data for 2016
Getting data for 2017
Getting data for 2018
Getting data for 2019
Getting data for 2020
Getting data for 2021
Getting data for 2022


In [4]:
# Inspect the column dtypes, then display the merged dataframe
print(census_df.dtypes)
census_df

NAME            object
B01003_001E    float64
B08006_014E    float64
state           object
county          object
year             int64
dtype: object


Unnamed: 0,NAME,B01003_001E,B08006_014E,state,county,year
0,"San Francisco County, California",807755.0,14833.0,6,75,2012
0,"San Francisco County, California",817501.0,15631.0,6,75,2013
0,"San Francisco County, California",829072.0,17356.0,6,75,2014
0,"San Francisco County, California",840763.0,18883.0,6,75,2015
0,"San Francisco County, California",850282.0,19822.0,6,75,2016
0,"San Francisco County, California",864263.0,19410.0,6,75,2017
0,"San Francisco County, California",870044.0,20298.0,6,75,2018
0,"San Francisco County, California",874961.0,20268.0,6,75,2019
0,"San Francisco County, California",874784.0,18725.0,6,75,2020
0,"San Francisco County, California",865933.0,16760.0,6,75,2021


In [5]:
# Drop the identifier columns, renumber the rows from 0, and give the
# census variable codes readable column names
census_df = (
    census_df
    .drop(columns=['NAME', 'state', 'county'])
    .reset_index(drop=True)
    .rename(columns={'B01003_001E': 'population', 'B08006_014E': 'bike_commuters'})
)
census_df

Unnamed: 0,population,bike_commuters,year
0,807755.0,14833.0,2012
1,817501.0,15631.0,2013
2,829072.0,17356.0,2014
3,840763.0,18883.0,2015
4,850282.0,19822.0,2016
5,864263.0,19410.0,2017
6,870044.0,20298.0,2018
7,874961.0,20268.0,2019
8,874784.0,18725.0,2020
9,865933.0,16760.0,2021


In [6]:
# Cast both count columns from float to int in one pass, then put 'year' first
census_df = census_df.astype({'population': int, 'bike_commuters': int})
census_df = census_df[['year', 'population', 'bike_commuters']]
census_df

Unnamed: 0,year,population,bike_commuters
0,2012,807755,14833
1,2013,817501,15631
2,2014,829072,17356
3,2015,840763,18883
4,2016,850282,19822
5,2017,864263,19410
6,2018,870044,20298
7,2019,874961,20268
8,2020,874784,18725
9,2021,865933,16760


In [7]:
# Export dataframe to csv
# Create the output folder first: to_csv does not create missing parent
# directories and would fail on a fresh checkout without 'resources/'
output_path = Path('resources/census_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
census_df.to_csv(output_path, index=False)

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv
