In [2]:
import sys
import os
import requests
import pandas as pd
import re

In [3]:
# State names to keep in the final extract; compared against the
# 'state_name' column by filter_acs().
STATES = [
    'Delaware',
    'District of Columbia',
    'Maryland',
    'North Carolina',
    'Virginia',
]

In [4]:
def get_census_data(
    endpoint="2021/acs/acs5",
    get_vars="NAME,B19013_001E",
    for_clause="place:*",
    in_clause="state:*",
    timeout=60,
):
    """
    Retrieves data from the U.S. Census Bureau API and returns it as a Pandas DataFrame.

    Parameters:
        endpoint (str): The specific API endpoint (e.g., "2021/acs/acs5").
        get_vars (str): Comma-separated list of variables to retrieve.
        for_clause (str): The 'for' clause specifying the geography.
        in_clause (str): The 'in' clause specifying nested geographies.
            Pass an empty string or None to omit it from the request.
        timeout (float | tuple): Seconds to wait for the server, forwarded
            to requests.get(). Defaults to 60.

    Returns:
        pd.DataFrame: One row per geography, with columns named after the
        header row of the response. All values are left exactly as parsed
        from the JSON payload (no numeric conversion happens here). An
        empty payload yields an empty DataFrame.

    Raises:
        requests.exceptions.HTTPError: Propagated from
            response.raise_for_status() on a 4xx/5xx reply.
        requests.exceptions.Timeout: If the server does not answer within
            `timeout` seconds.
    """
    url = "https://api.census.gov/data/" + endpoint

    params = {"get": get_vars, "for": for_clause}
    if in_clause:
        params["in"] = in_clause

    # requests has no default timeout; without one a stalled connection
    # would block the notebook cell forever.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors

    data = response.json()
    if not data:
        # Nothing came back, so there is no header row to unpack.
        return pd.DataFrame()

    # The first row of the payload is the header; the rest are records.
    header, *rows = data
    return pd.DataFrame(rows, columns=header)

In [5]:

def get_acs_data():
    """
    Downloads place-level 2021 ACS 5-year estimates and tidies them.

    Requests income, population, age and education variables for every
    place in every state via get_census_data(), converts them to numbers,
    renames them to readable column names and derives a GEOID plus
    'town' / 'state_name' from the NAME field.

    Notes:
        * The ACS API reports estimates it cannot publish as large negative
          sentinel values (e.g. -666666666). None of the requested measures
          can legitimately be negative, so negative values are converted to
          NaN instead of being kept as numbers.
        * B19301_001E is per-capita income (not a median); the output column
          keeps the name 'median_percapita_income' for compatibility with
          existing consumers of the CSV.
        * B15003_022E counts people whose highest attainment is a bachelor's
          degree; graduate and professional degrees are separate variables
          of table B15003 and are not included.

    Returns:
        pd.DataFrame: Columns 'geoid', 'name', 'state_code', 'place_code',
        'state_name', 'town', 'median_household_income',
        'median_percapita_income', 'population', 'median_age',
        'total_25_plus_education', 'bachelor_degree'.
    """
    # Census variable code -> output column name. Kept in one place so the
    # request, the numeric conversion and the rename cannot drift apart.
    acs_vars = {
        "B19013_001E": "median_household_income",   # Median Household Income
        "B19301_001E": "median_percapita_income",   # Per Capita Income
        "B01003_001E": "population",                # Total Population
        "B01002_001E": "median_age",                # Median Age
        "B15003_001E": "total_25_plus_education",   # Population 25+ (education universe)
        "B15003_022E": "bachelor_degree",           # Bachelor's degree (exactly)
    }

    df = get_census_data(
        endpoint="2021/acs/acs5",
        get_vars=",".join(["NAME", *acs_vars]),
        for_clause="place:*",
        in_clause="state:*"
    )

    for col in acs_vars:
        values = pd.to_numeric(df[col], errors='coerce')
        # Negative numbers are ACS "not available" sentinels, not data.
        df[col] = values.where(values >= 0)

    # Rename the measures and the geography codes (which come from the
    # 'for' and 'in' clauses), then lower-case everything that is left
    # (i.e. NAME -> name).
    df = df.rename(columns={**acs_vars, "state": "state_code", "place": "place_code"})
    df.columns = df.columns.str.lower()

    # Create a unique GEOID by combining state and place codes.
    # ACS state codes are 2 digits; place codes are usually 5 digits.
    df["geoid"] = df["state_code"].str.zfill(2) + df["place_code"].str.zfill(5)

    # NAME looks like "<place>, <state>": the first comma-separated part is
    # the town and the last part is the full state name. The .str accessor
    # propagates missing names as NaN instead of raising.
    name_parts = df["name"].str.split(",")
    df["town"] = name_parts.str[0].str.strip()
    df["state_name"] = name_parts.str[-1].str.strip()

    # Define the order of columns to keep.
    keep_cols = [
        'geoid', 'name', 'state_code', 'place_code', 'state_name', 'town',
        'median_household_income', 'median_percapita_income',
        'population', 'median_age', 'total_25_plus_education', 'bachelor_degree'
    ]

    # Return an independent frame so later column assignments by callers do
    # not trigger pandas' SettingWithCopyWarning.
    return df[keep_cols].copy()

In [6]:

def clean_town_names(df):
    """
    Returns a copy of the DataFrame with a cleaned 'town' column.

    Cleaning steps, applied in this order:
      1. Remove any of the following designations when they appear as whole
         words: 'CDP', 'city and borough', 'city', 'town', 'borough',
         'municipality', 'village', 'urban county', 'metro township'.
         Matching is case-sensitive, so only text spelled exactly like the
         keywords is removed and a capitalised word inside a proper name is
         kept ("Kansas City city" -> "Kansas City").
      2. Remove any content in parentheses (including the parentheses).
      3. Collapse runs of whitespace to a single space and trim the ends.

    Whole-word matching ensures that, for example, "Georgetown town"
    becomes "Georgetown" rather than "George".

    Parameters:
        df (pd.DataFrame): DataFrame containing a 'town' column.

    Returns:
        pd.DataFrame: New DataFrame with the cleaned 'town' column; the
        input frame is left unmodified.
    """
    remove_from_names = [
        'CDP', 'city and borough', 'city', 'town', 'borough',
        'municipality', 'village', 'urban county', 'metro township'
    ]

    # Word boundaries restrict matches to whole words; re.escape guards
    # against special characters in the keywords. 'city and borough' is
    # listed before 'city' so the longer phrase is tried first.
    pattern = r'\b(?:' + '|'.join(map(re.escape, remove_from_names)) + r')\b'

    cleaned = (
        df['town']
        .str.replace(pattern, '', regex=True)          # drop designations
        .str.replace(r'\(.*?\)', '', regex=True)       # drop "(...)" notes
        .str.replace(r'\s+', ' ', regex=True)          # collapse whitespace
        .str.strip()
    )

    # assign() builds a new frame, so the caller's data is not mutated and
    # no SettingWithCopyWarning is raised when `df` is a slice.
    return df.assign(town=cleaned)

In [7]:
def filter_acs(df, states):
    """
    Keeps only the rows whose 'state_name' appears in `states`.

    Parameters:
    df (pd.DataFrame): Frame with a 'state_name' column.
    states (list): State names to retain.

    Returns:
    pd.DataFrame: A new DataFrame holding the matching rows (original index
    labels preserved); the input frame is not modified.
    """
    # Boolean mask of the rows belonging to one of the requested states.
    in_requested_states = df['state_name'].isin(states)
    # Work on a copy so the caller's frame stays untouched.
    return df.copy().loc[in_requested_states]


In [13]:
def main():
    """
    Retrieves, cleans, filters, and saves ACS data.

    This function:
    1. Fetches ACS data using `get_acs_data()`.
    2. Cleans town names using `clean_town_names()`.
    3. Filters the data for the states listed in `STATES` using `filter_acs()`.
    4. Saves the processed DataFrame to '../data/acs_raw.csv' (relative to
       the current working directory), creating the directory if needed.

    The current working directory is also added to `sys.path` (once, even
    if the function is run repeatedly).

    Returns:
    None
    """
    # Make modules next to the notebook importable. Guard against stacking a
    # duplicate entry every time the cell is re-run.
    working_dir = os.path.abspath('.')
    if working_dir not in sys.path:
        sys.path.append(working_dir)

    # Process the ACS data
    df_acs = (
        get_acs_data()
        .pipe(clean_town_names)
        .pipe(filter_acs, states=STATES)
    )

    # Define the file path and save to CSV. to_csv() does not create missing
    # parent directories, so make sure the target folder exists first.
    filepath = os.path.join('..', 'data', 'acs_raw.csv')
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    df_acs.to_csv(filepath, index=False)

In [14]:
# Run the pipeline: calls the Census API and writes ../data/acs_raw.csv.
main()