In [1]:
import requests
import csv
import os 
import pandas as pd
import re

In [14]:
# Folder that receives the downloaded CSV files
data_folder = "datatest"

# Text file that holds the US Census API key
api_file = "apikey_uscensus.txt"

# Every table series is served from its own API endpoint; the value is the
# path suffix that is appended to ".../acs/acs5" for the given series prefix.
# Overview of table codes: https://censusreporter.org/topics/table-codes/
# (check the API link of the specific table before adding a new series)
series_dictionary = dict(
    B="",
    S="/subject",
    DP="/profile",
)

In [15]:
def checkseries(variables):
    """Return the table-series prefix shared by all given variable codes.

    The prefix is the run of letters at the very start of a code, e.g. "S"
    for "S1501_C01_006E" or "DP" for "DP02_0001E".

    Parameters
    ----------
    variables : iterable of str
        Census variable codes to inspect.

    Returns
    -------
    str
        The leading-letter prefix common to every code.

    Raises
    ------
    ValueError
        If ``variables`` is empty, if a code does not start with a letter,
        or if the codes do not all share the same prefix.
    """
    variables = list(variables)
    if not variables:
        # Without this guard the loop is skipped and the function would
        # report and return a series of None.
        raise ValueError("No variables given, cannot detect series.")

    series = None

    for var in variables:
        # extract letters until first number
        match = re.match(r"([A-Za-z]+)", var)
        if match is None:
            raise ValueError(f"Variable {var} does not start with letters, cant detect series.")
        beginningstring = match.group(1)

        if series is None:
            # The first variable defines the series all others must match
            series = beginningstring
        elif beginningstring != series:
            raise ValueError("Import variables must be of the same series.")

    print(f"All variables belong to {series} series, API link will be adjusted accordingly.")

    return series


In [11]:

def uscensus_importcsv(column_dictionary, year, output_file_name):
    """Download county-level ACS 5-year data and save it as a CSV file.

    Parameters
    ----------
    column_dictionary : dict
        Maps Census variable codes (e.g. "S1501_C01_006E") to the descriptive
        column names written to the CSV. All codes must belong to one series
        that is listed in ``series_dictionary``.
    year : str
        Data year that is inserted into the API URL (e.g. "2023").
    output_file_name : str
        Name of the CSV file that is created inside ``data_folder``.

    Returns
    -------
    tuple
        ``(output_file_path, imported_variables)``: the path of the written
        CSV and the descriptive names of the requested variables.

    Raises
    ------
    Exception
        If the series is not in ``series_dictionary`` or the API does not
        answer with status code 200.
    """
    # Import API key from api_file; stripped so that a trailing newline in
    # the key file does not end up inside the URL
    with open(api_file, "r") as file:
        api_key = file.read().strip()

    # Extract all specified variables as list from the dictionary
    specific_variables = list(column_dictionary.keys())

    # Add NAME as additional variable which contains county and state name
    all_variables = ["NAME"] + specific_variables

    # Format variables as string for the link
    link_variables = ",".join(all_variables)
    print(link_variables)

    # Use predefined function to check the series
    series = checkseries(specific_variables)

    dataseries = series_dictionary.get(series)
    if dataseries is None:
        raise Exception(f"Series not recognized, please check if {series} is included in series_dictionary.")

    # Construct the URL with the specified variables. Only the key-free part
    # is printed so the secret does not leak into saved notebook output.
    query_url = f"https://api.census.gov/data/{year}/acs/acs5{dataseries}?get={link_variables}&for=county:*"
    print(f"{query_url}&key=***")
    url = f"{query_url}&key={api_key}"

    #
    ### Make request to api
    #

    # The request is made before any file is touched, so a failed download
    # no longer deletes the previous CSV. The timeout keeps the notebook from
    # hanging forever on an unresponsive server.
    response = requests.get(url, timeout=60)

    # Fail loudly: continuing would hit the return statement with
    # imported_variables undefined.
    if response.status_code != 200:
        raise Exception(f"Request failed with status code {response.status_code}")

    # Parse the JSON data; the first entry holds the column headers
    data = response.json()
    original_headers = data[0]
    print(original_headers)

    # Reorder the headers
    reordered_headers = ["state", "county"] + all_variables

    # If header (column name) is in the dictionary, the variable is renamed,
    # otherwise it is capitalized (first letter uppercase, rest lowercase)
    descriptive_headers = [
        column_dictionary[header] if header in column_dictionary else header.capitalize()
        for header in reordered_headers
    ]
    print(descriptive_headers)

    # Descriptive names of the requested variables, in request order
    imported_variables = [column_dictionary[variable] for variable in specific_variables]
    print(f"Imported Variables:{imported_variables}")

    #
    ### Check files
    #

    # Check if data_folder exists, if not create the folder
    if not os.path.exists(data_folder):
        print(f"{data_folder} does not exist, creating...")
        os.makedirs(data_folder)

    # Define the output file path
    output_file_path = os.path.join(data_folder, output_file_name)

    # Check if file already exists, if yes delete
    if os.path.exists(output_file_path):
        print("Existing file found, removing...")
        os.remove(output_file_path)

    # Look up the source position of every output column once instead of
    # searching the header list again for each cell of each row
    column_positions = [original_headers.index(col) for col in reordered_headers]

    # Write data to CSV with reordered and renamed headers. The encoding is
    # set explicitly to latin-1 because uscensus_modify reads the file back
    # with that encoding; the platform default would differ between systems.
    with open(output_file_path, "w", newline="", encoding="latin-1") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(descriptive_headers)  # Header row with descriptive names
        writer.writerows(
            [row[position] for position in column_positions]
            for row in data[1:]
        )

    print(f"Data saved to {output_file_path}")

    return output_file_path, imported_variables

In [16]:
# Universal modify for csv

def uscensus_modify(output_file_path, specific_variables, expected_counties=3144, excluded_states=("Puerto Rico",)):
    """Clean a CSV written by the import step and overwrite it in place.

    Builds a five-digit "FIPS Code" from the "State" and "County" codes,
    splits "Name" into "County Name" and "State Name", keeps only these
    three columns plus ``specific_variables``, drops the excluded states and
    checks the remaining row count before saving.

    Parameters
    ----------
    output_file_path : str
        Path of the CSV file to read and overwrite.
    specific_variables : list of str
        Names of the data columns to keep.
    expected_counties : int or None, optional
        Number of rows that must remain after filtering. The default 3144 is
        the value this notebook checks for; pass ``None`` to skip the check.
    excluded_states : iterable of str, optional
        Values of "State Name" whose rows are removed.

    Returns
    -------
    pandas.DataFrame
        The modified frame that was written to ``output_file_path``.

    Raises
    ------
    Exception
        If a requested column is missing or the row count check fails.
    """
    # State / County as string to keep leading 0
    code_dtypes = {"State": str, "County": str}

    # Try UTF-8 first and fall back to Latin-1: reading a UTF-8 file as
    # Latin-1 never fails but silently garbles names with special characters.
    try:
        df = pd.read_csv(output_file_path, encoding="utf-8", dtype=code_dtypes)
    except UnicodeDecodeError:
        df = pd.read_csv(output_file_path, encoding="latin-1", dtype=code_dtypes)

    # 1. Merge the "State" Code and "County" Code to create the "FIPS Code" column
    df["FIPS Code"] = df["State"].astype(str).str.zfill(2) + df["County"].astype(str).str.zfill(3)

    # 2. Split the Name into "County Name" and "State Name". Splitting once
    # from the right always yields exactly two parts, even if a county name
    # itself contains ", ".
    df[["County Name", "State Name"]] = df["Name"].str.rsplit(", ", n=1, expand=True)

    # 3. Specify columns to keep: the derived columns plus the imported ones
    columns_to_keep = ["FIPS Code", "County Name", "State Name"] + list(specific_variables)
    # Check if all specified columns are in the df
    for col in columns_to_keep:
        if col not in df.columns:
            raise Exception(f"{col} not in df, check again")
    # Keep all the desired columns
    df = df[columns_to_keep]

    # 4. Remove all counties in the excluded states (Puerto Rico by default)
    df = df[~df["State Name"].isin(list(excluded_states))]

    # 5. Check if df contains the expected number of counties
    if expected_counties is not None and df.shape[0] != expected_counties:
        raise Exception(f"{df.shape[0]} instead of {expected_counties} counties in df, check again")

    # 6. Overwrite the modified DataFrame back to the original file
    df.to_csv(output_file_path, index=False)
    print(f"{output_file_path} has been modified and saved")

    # Returned so the caller can inspect the result; the former bare
    # `df.head()` inside the function had no effect.
    return df

In [17]:
# Specify the desired variables and rename them:
# each US Census variable code is mapped to the column name used in the CSV
education_dictionary = {
    # "USCensusVariable": "Desired Name"
    "S1501_C01_006E": "Population 25 years and over",
    "S1501_C01_014E": "Population 25 years and over!Cumulative!High school graduate or higher",
    "S1501_C01_015E": "Population 25 years and over!Cumulative!Bachelors degree or higher"
}

# Specify the desired year of the data
education_year = "2023"

# Specify the file name for the csv file; built from the year above so the
# two values cannot drift apart when the year is changed
education_output_file_name = f"education_county_{education_year}.csv"

In [20]:
# Download the education variables, then clean the resulting CSV in place
education_output_file_path, education_specific_variables = uscensus_importcsv(
    column_dictionary=education_dictionary,
    year=education_year,
    output_file_name=education_output_file_name,
)

uscensus_modify(
    output_file_path=education_output_file_path,
    specific_variables=education_specific_variables,
)

NAME,S1501_C01_006E,S1501_C01_014E,S1501_C01_015E
All variables belong to S series, API link will be adjusted accordingly.
https://api.census.gov/data/2023/acs/acs5/subject?get=NAME,S1501_C01_006E,S1501_C01_014E,S1501_C01_015E&for=county:*&key=REDACTED
Existing file found, removing...
['NAME', 'S1501_C01_006E', 'S1501_C01_014E', 'S1501_C01_015E', 'state', 'county']
['State', 'County', 'Name', 'Population 25 years and over', 'Population 25 years and over!Cumulative!High school graduate or higher', 'Population 25 years and over!Cumulative!Bachelors degree or higher']
Imported Variables:['Population 25 years and over', 'Population 25 years and over!Cumulative!High school graduate or higher', 'Population 25 years and over!Cumulative!Bachelors degree or higher']
Data saved to datatest\education_county_2023.csv
datatest\education_county_2023.csv has been modified and saved


In [None]:
# Specify the desired variables and rename them:
# each US Census variable code is mapped to the column name used in the CSV
internetaccess_dictionary = {
    # "USCensusVariable": "Desired Name"
    "B28011_001E": "Total Households",
    "B28011_002E": "Households!With an Internet Subscription"
}

# Specify the desired year of the data
internetaccess_year = "2023"

# Specify the file name for the csv file; built from the year above so the
# two values cannot drift apart when the year is changed
internetaccess_output_file_name = f"internetaccess_county_{internetaccess_year}.csv"

In [25]:
# Download the internet access variables, then clean the resulting CSV in place
internetaccess_output_file_path, internetaccess_specific_variables = uscensus_importcsv(
    column_dictionary=internetaccess_dictionary,
    year=internetaccess_year,
    output_file_name=internetaccess_output_file_name,
)

uscensus_modify(
    output_file_path=internetaccess_output_file_path,
    specific_variables=internetaccess_specific_variables,
)

NAME,B28011_001E,B28011_002E
All variables belong to B series, API link will be adjusted accordingly.
https://api.census.gov/data/2023/acs/acs5?get=NAME,B28011_001E,B28011_002E&for=county:*&key=REDACTED
['NAME', 'B28011_001E', 'B28011_002E', 'state', 'county']
['State', 'County', 'Name', 'Total Households', 'Households!With an Internet Subscription']
Imported Variables:['Total Households', 'Households!With an Internet Subscription']
Data saved to datatest\internetaccess_county_2023.csv
datatest\internetaccess_county_2023.csv has been modified and saved


In [27]:
# Specify the desired variables and rename them:
# each US Census variable code is mapped to the column name used in the CSV
householdsize_dictionary = {
    # "USCensusVariable": "Desired Name"
    "DP02_0001E": "Total Households",
    "DP02_0016E": "Households!Average Household Size"
}

# Specify the desired year of the data
householdsize_year = "2023"

# Specify the file name for the csv file; built from the year above so the
# two values cannot drift apart when the year is changed
householdsize_output_file_name = f"householdsize_county_{householdsize_year}.csv"

In [28]:
# Download the household size variables, then clean the resulting CSV in place
householdsize_output_file_path, householdsize_specific_variables = uscensus_importcsv(
    column_dictionary=householdsize_dictionary,
    year=householdsize_year,
    output_file_name=householdsize_output_file_name,
)

uscensus_modify(
    output_file_path=householdsize_output_file_path,
    specific_variables=householdsize_specific_variables,
)

NAME,DP02_0001E,DP02_0016E
All variables belong to DP series, API link will be adjusted accordingly.
https://api.census.gov/data/2023/acs/acs5/profile?get=NAME,DP02_0001E,DP02_0016E&for=county:*&key=REDACTED
['NAME', 'DP02_0001E', 'DP02_0016E', 'state', 'county']
['State', 'County', 'Name', 'Total Households', 'Households!Average Household Size']
Imported Variables:['Total Households', 'Households!Average Household Size']
Data saved to datatest\householdsize_county_2023.csv
datatest\householdsize_county_2023.csv has been modified and saved


In [29]:
# Specify the desired variables and rename them:
# each US Census variable code is mapped to the column name used in the CSV
meanincome_dictionary = {
    # "USCensusVariable": "Desired Name"
    "S1901_C01_001E": "Total Households",
    "S1901_C01_013E": "Households!Mean Income"
}

# Specify the desired year of the data
meanincome_year = "2023"

# Specify the file name for the csv file; built from the year above so the
# two values cannot drift apart when the year is changed
meanincome_output_file_name = f"meanincome_county_{meanincome_year}.csv"

In [30]:
# Download the mean income variables, then clean the resulting CSV in place
meanincome_output_file_path, meanincome_specific_variables = uscensus_importcsv(
    column_dictionary=meanincome_dictionary,
    year=meanincome_year,
    output_file_name=meanincome_output_file_name,
)

uscensus_modify(
    output_file_path=meanincome_output_file_path,
    specific_variables=meanincome_specific_variables,
)

NAME,S1901_C01_001E,S1901_C01_013E
All variables belong to S series, API link will be adjusted accordingly.
https://api.census.gov/data/2023/acs/acs5/subject?get=NAME,S1901_C01_001E,S1901_C01_013E&for=county:*&key=REDACTED
['NAME', 'S1901_C01_001E', 'S1901_C01_013E', 'state', 'county']
['State', 'County', 'Name', 'Total Households', 'Households!Mean Income']
Imported Variables:['Total Households', 'Households!Mean Income']
Data saved to datatest\meanincome_county_2023.csv
datatest\meanincome_county_2023.csv has been modified and saved


In [31]:
# Specify the desired variables and rename them:
# each US Census variable code is mapped to the column name used in the CSV
industrycomposition_dictionary = {
    # "USCensusVariable": "Desired Name"
    "S2405_C01_001E": "Total 16+ Employed",
    "S2405_C01_002E": "16+ Employed!Agriculture, forestry, fishing and hunting, and mining",
    "S2405_C01_003E": "16+ Employed!Construction",
    "S2405_C01_004E": "16+ Employed!Manufacturing",
    "S2405_C01_005E": "16+ Employed!Wholesale Trade",
    "S2405_C01_006E": "16+ Employed!Retail Trade",
    "S2405_C01_007E": "16+ Employed!Transportation and warehousing, and utilities",
    "S2405_C01_008E": "16+ Employed!Information",
    "S2405_C01_009E": "16+ Employed!Finance and insurance, and real estate and rental and leasing",
    "S2405_C01_010E": "16+ Employed!Professional, scientific, and management, and administrative and waste management services",
    "S2405_C01_011E": "16+ Employed!Educational services, and health care and social assistance",
    "S2405_C01_012E": "16+ Employed!Arts, entertainment, and recreation, and accommodation and food services",
    "S2405_C01_013E": "16+ Employed!Other services, except public administration",
    "S2405_C01_014E": "16+ Employed!Public administration"
}

# Specify the desired year of the data
industrycomposition_year = "2023"

# Specify the file name for the csv file; built from the year above so the
# two values cannot drift apart when the year is changed
industrycomposition_output_file_name = f"industrycomposition_county_{industrycomposition_year}.csv"

In [32]:
# Download the industry composition variables, then clean the resulting CSV in place
industrycomposition_output_file_path, industrycomposition_specific_variables = uscensus_importcsv(
    column_dictionary=industrycomposition_dictionary,
    year=industrycomposition_year,
    output_file_name=industrycomposition_output_file_name,
)

uscensus_modify(
    output_file_path=industrycomposition_output_file_path,
    specific_variables=industrycomposition_specific_variables,
)

NAME,S2405_C01_001E,S2405_C01_002E,S2405_C01_003E,S2405_C01_004E,S2405_C01_005E,S2405_C01_006E,S2405_C01_007E,S2405_C01_008E,S2405_C01_009E,S2405_C01_010E,S2405_C01_011E,S2405_C01_012E,S2405_C01_013E,S2405_C01_014E
All variables belong to S series, API link will be adjusted accordingly.
https://api.census.gov/data/2023/acs/acs5/subject?get=NAME,S2405_C01_001E,S2405_C01_002E,S2405_C01_003E,S2405_C01_004E,S2405_C01_005E,S2405_C01_006E,S2405_C01_007E,S2405_C01_008E,S2405_C01_009E,S2405_C01_010E,S2405_C01_011E,S2405_C01_012E,S2405_C01_013E,S2405_C01_014E&for=county:*&key=REDACTED
['NAME', 'S2405_C01_001E', 'S2405_C01_002E', 'S2405_C01_003E', 'S2405_C01_004E', 'S2405_C01_005E', 'S2405_C01_006E', 'S2405_C01_007E', 'S2405_C01_008E', 'S2405_C01_009E', 'S2405_C01_010E', 'S2405_C01_011E', 'S2405_C01_012E', 'S2405_C01_013E', 'S2405_C01_014E', 'state', 'county']
['State', 'County', 'Name', 'Total 16+ Employed', '16+ Employed!Agriculture, forestry, fishing and huntin

In [33]:
# Specify the desired variables and rename them:
# each US Census variable code is mapped to the column name used in the CSV
fertilityrate_dictionary = {
    # "USCensusVariable": "Desired Name"
    "S1301_C01_001E": "Total Women 15 to 50 years",
    "S1301_C02_001E": "Women 15 to 50 years!Women with births in the last 12 months",
}

# Specify the desired year of the data
fertilityrate_year = "2023"

# Specify the file name for the csv file; built from the year above so the
# two values cannot drift apart when the year is changed
fertilityrate_output_file_name = f"fertilityrate_county_{fertilityrate_year}.csv"

In [34]:
# Download the fertility rate variables, then clean the resulting CSV in place
fertilityrate_output_file_path, fertilityrate_specific_variables = uscensus_importcsv(
    column_dictionary=fertilityrate_dictionary,
    year=fertilityrate_year,
    output_file_name=fertilityrate_output_file_name,
)

uscensus_modify(
    output_file_path=fertilityrate_output_file_path,
    specific_variables=fertilityrate_specific_variables,
)


NAME,S1301_C01_001E,S1301_C02_001E
All variables belong to S series, API link will be adjusted accordingly.
https://api.census.gov/data/2023/acs/acs5/subject?get=NAME,S1301_C01_001E,S1301_C02_001E&for=county:*&key=REDACTED
['NAME', 'S1301_C01_001E', 'S1301_C02_001E', 'state', 'county']
['State', 'County', 'Name', 'Total Women 15 to 50 years', 'Women 15 to 50 years!Women with births in the last 12 months']
Imported Variables:['Total Women 15 to 50 years', 'Women 15 to 50 years!Women with births in the last 12 months']
Data saved to datatest\fertilityrate_county_2023.csv
datatest\fertilityrate_county_2023.csv has been modified and saved
