In [1]:
import requests
import csv
import os 
import pandas as pd

ModuleNotFoundError: No module named 'requests'

In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader may want to tune lives here.
# ---------------------------------------------------------------------------

# Text file holding the US Census API key
api_file = "apikey_uscensus.txt"

# Data vintage (year) to request
year = "2023"

# Output location: folder name and csv file name
data_folder = "datatest"
output_file_name = "education_county_2023.csv"

# Variables to download, in the order they should appear in the csv.
# Each entry maps a US Census variable code to the column header written to
# the file: "USCensusVariable": "Desired Name"
column_mappings = {
    "NAME": "Geographic Area Name",
    "S1501_C01_006E": "Population 25 years and over",
    "S1501_C01_014E": "Population 25 years and over!Cumulative!High school graduate or higher",
    "S1501_C01_015E": "Population 25 years and over!Cumulative!Bachelors degree or higher",
}

In [None]:
# Download the variables listed in `column_mappings` for every county from the
# US Census API and save them as a CSV file inside `data_folder`.

# Import API key from api_file.
# strip() drops the trailing newline / whitespace that editors commonly append;
# left in place it would be embedded in the request URL.
with open(api_file, "r") as file:
    api_key = file.read().strip()

# Construct the variable string from the dictionary keys
variables_str = ",".join(column_mappings.keys())

# Construct the URL with the specified variables
url = f"https://api.census.gov/data/{year}/acs/acs5/subject?get={variables_str}&for=county:*&key={api_key}"

# Check if data_folder exists, if not create the folder
if not os.path.exists(data_folder):
    print(f"{data_folder} does not exist, creating...")
    os.makedirs(data_folder)

# Define the output file path
output_file_path = os.path.join(data_folder, output_file_name)

# Check if file already exists, if yes delete.
# Done before the request, so a failed download leaves no older file behind.
if os.path.exists(output_file_path):
    print("Existing file found, removing...")
    os.remove(output_file_path)

# Make the request. The timeout (seconds) keeps the notebook from hanging
# forever if the server never answers.
response = requests.get(url, timeout=60)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON data: the first row holds the headers, the rest the records
    data = response.json()
    original_headers = data[0]

    # Output column order: "state" and "county" first, then the dictionary order
    reordered_headers = ["state", "county"] + list(column_mappings.keys())

    # Map to descriptive names; "state" / "county" are not in the dictionary
    # and fall back to their capitalized form ("State", "County")
    csv_headers = [column_mappings.get(col, col.capitalize()) for col in reordered_headers]

    # Look up the position of every output column once instead of once per cell
    column_positions = [original_headers.index(col) for col in reordered_headers]

    # Write data to CSV with reordered headers.
    # The loading step reads this file with encoding="latin-1", so write it with
    # the same encoding instead of the platform default.
    with open(output_file_path, "w", newline='', encoding="latin-1") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(csv_headers)  # Header row with descriptive names
        writer.writerows(
            [row[position] for position in column_positions] for row in data[1:]
        )

    print(f"Data saved to {output_file_path}")
else:
    print(f"Request failed with status code {response.status_code}")

Existing file found, removing...
Data saved to datatest\education_county_2023.csv


In [5]:
# Post-process the downloaded CSV: build the FIPS code, split the area name
# into county / state, keep only the configured columns, drop Puerto Rico and
# overwrite the file with the result.

# Load the CSV file into a pandas DataFrame
# Latin Encoding (Puerto Rico County Names with special characters)
# State / County as string to keep leading 0
df = pd.read_csv(output_file_path, encoding="latin-1", dtype={"State": str, "County": str})

# This cell overwrites its own input file (step 6), so the raw columns only
# exist right after a fresh download. Report that clearly instead of failing
# later with a bare KeyError.
raw_columns = ["State", "County", "Geographic Area Name"]
missing_raw_columns = [col for col in raw_columns if col not in df.columns]
if missing_raw_columns:
    raise KeyError(
        f"{missing_raw_columns} not in {output_file_path}; the file may already "
        "have been processed, re-run the download cell first"
    )

# 1. Merge the StateCode and CountyCode to create the FIPS Code column
df["FIPS Code"] = df["State"].astype(str).str.zfill(2) + df["County"].astype(str).str.zfill(3)

# 2. Split the "Geographic Area Name" into "County Name" and "State Name"
# Split once, at the last ", ", so that an extra comma inside a name cannot
# produce more than two columns.
df[["County Name", "State Name"]] = df["Geographic Area Name"].str.rsplit(", ", n=1, expand=True)

# 3. Specify columns to keep
# Start with additional columns to keep
additional_columns = ["FIPS Code", "County Name", "State Name"]
# Add the imported columns specified in the dictionary, except the first one ("NAME")
imported_columns = list(column_mappings.values())[1:]
# Combine both lists
columns_to_keep = additional_columns + imported_columns
# Check if all specified columns are in the df
for col in columns_to_keep:
    if col not in df.columns:
        raise ValueError(f"{col} not in df, check again")
# Keep all the desired columns
df = df[columns_to_keep]

# 4. Remove all counties in Puerto Rico
df = df[df["State Name"] != "Puerto Rico"]

# 5. Check if df contains 3144 counties
# NOTE(review): 3144 is presumably the expected number of counties for this
# data year - confirm it whenever `year` changes.
expected_county_count = 3144
if df.shape[0] != expected_county_count:
    raise ValueError(f"{df.shape[0]} instead of 3144 counties in df, check again")

# 6. Overwrite the modified DataFrame back to the original file
df.to_csv(output_file_path, index=False)
print(f"{output_file_path} has been modified and saved")

df.head()

datatest\education_county_2023.csv has been modified and saved


Unnamed: 0,FIPS Code,County Name,State Name,Population 25 years and over,Population 25 years and over!Cumulative!High school graduate or higher,Population 25 years and over!Cumulative!Bachelors degree or higher
0,1001,Autauga County,Alabama,40767,36804,11530
1,1003,Baldwin County,Alabama,171988,157767,56408
2,1005,Barbour County,Alabama,17628,13717,2021
3,1007,Bibb County,Alabama,15931,12799,1827
4,1009,Blount County,Alabama,40991,33898,6386
