In [2]:
import requests
import csv
import os 
import pandas as pd
import re

## US Census Data

### Helper Functions 

In [3]:
from uscensus_functions import uscensus_importcsv

from uscensus_functions import uscensus_modify

```python

# Specify the desired variables and rename them
name_dictionary = {
    # "USCensusVariable": "Desired Name" 
    "S_": "Population",
}

# Specify the year of the data
name_year = "2000"

# Specify the file name for the csv file
name_output_file_name = "name_county_2023.csv"

name_output_file_path, name_specific_variables = uscensus_importcsv(name_dictionary, name_year, name_output_file_name)

uscensus_modify(name_output_file_path, name_specific_variables)

```python

# Specify the folder where the file will be saved
data_folder = "data_uscensus"

# Specify the text file where the US Census API key is stored
api_file = "apikey_uscensus.txt"

# Different data series require different api requests 
# https://censusreporter.org/topics/table-codes/
# check api link for specific table 
series_dictionary = {
    "B": "",
    "S": "/subject",
    "DP": "/profile"
}

# Specify the year of the data
year = "2023"


### Demographics

**Dataset Link:** https://data.census.gov/table/ACSDP5Y2023.DP05?q=county&y=2023

**License:** Public Domain

**Dataset Name (CSV):** demographics.csv

| Variable Name          | Explanation                                  | Import Code / Derivation | CSV  | Final Dataset  |
|------------------------|----------------------------------------------|--------------------------|------|--------------|
| median_age            | Median age across the total population.      | DP05_0018E               | [x]  | [x]          |
| males_per_100_females | Number of males per 100 females (Sex Ratio). | DP05_0004E               | [x]  | [x]          |




In [4]:
# Census variable codes (ACS 5-year Data Profile DP05) mapped to the column
# names they receive in the saved CSV.
demographics_dictionary = {
    "DP05_0018E": "median_age",
    "DP05_0004E": "males_per_100_females",
}

# Year of the data to import.
demographics_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
demographics_output_file_name = f"demographics_county_{demographics_year}.csv"

In [5]:
# Import the requested variables into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
demographics_output_file_path, demographics_specific_variables = uscensus_importcsv(
    demographics_dictionary,
    demographics_year,
    demographics_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    demographics_output_file_path,
    demographics_specific_variables,
)

Existing file found, removing...
Imported Variables:['median_age', 'males_per_100_females']
Data saved to data_uscensus\demographics_county_2023.csv
data_uscensus\demographics_county_2023.csv has been modified and saved


In [6]:
# Load the saved CSV; FIPS codes are read as strings rather than numbers.
demographics_df = pd.read_csv(
    demographics_output_file_path,
    dtype={"fips_code": str},
)

# Preview the first rows.
demographics_df.head()

Unnamed: 0,fips_code,county_name,state_name,median_age,males_per_100_females
0,1001,Autauga County,Alabama,39.2,93.6
1,1003,Baldwin County,Alabama,43.7,95.7
2,1005,Barbour County,Alabama,40.7,108.9
3,1007,Bibb County,Alabama,41.3,114.5
4,1009,Blount County,Alabama,40.9,102.0


### Education

**Dataset Link:** https://data.census.gov/table/ACSST5Y2023.S1501?q=county+education&y=2023 

**License:** Public Domain

**Dataset Name (CSV):** education.csv

| Variable Name                | Explanation                                                  | Import Code / Derivation       | CSV  | Final Dataset  |
|------------------------------|--------------------------------------------------------------|--------------------------------|------|--------------|
| pop25                        | Population aged 25 years or older.                           | S1501_C01_006E                 | [x]  | [ ]          |
| pop_25plus_highschool_plus   | Population aged 25 years or older with high school graduate or higher. | S1501_C01_014E      | [x]  | [ ]          |
| pop_25plus_bachelor_plus     | Population aged 25 years or older with bachelor's degree or higher.  | S1501_C01_015E      | [x]  | [ ]          |
| pct_pop_25plus_highschool_plus | Percentage of population aged 25 years or older with high school graduate or higher. | = pop_25plus_highschool_plus / pop25 | [ ]  | [x]          |
| pct_pop_25plus_bachelor_plus | Percentage of population aged 25 years or older with bachelor's degree or higher. | = pop_25plus_bachelor_plus / pop25 | [ ]  | [x]          |


In [7]:
# Census variable codes (ACS 5-year Subject Table S1501) mapped to the column
# names they receive in the saved CSV.
education_dictionary = {
    "S1501_C01_006E": "pop25",
    "S1501_C01_014E": "pop_25plus_highschool_plus",
    "S1501_C01_015E": "pop_25plus_bachelor_plus",
}

# Year of the data to import.
education_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
education_output_file_name = f"education_county_{education_year}.csv"

In [8]:
# Import the requested variables into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
education_output_file_path, education_specific_variables = uscensus_importcsv(
    education_dictionary,
    education_year,
    education_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    education_output_file_path,
    education_specific_variables,
)

Existing file found, removing...
Imported Variables:['pop25', 'pop_25plus_highschool_plus', 'pop_25plus_bachelor_plus']
Data saved to data_uscensus\education_county_2023.csv
data_uscensus\education_county_2023.csv has been modified and saved


In [9]:
# Load the counts, derive each attainment share as a fraction of the population
# aged 25+, then discard the raw counts so only the shares remain.
education_df = (
    pd.read_csv(education_output_file_path, dtype={"fips_code": str})
    .assign(
        pct_pop_25plus_highschool_plus=lambda frame: frame["pop_25plus_highschool_plus"] / frame["pop25"],
        pct_pop_25plus_bachelor_plus=lambda frame: frame["pop_25plus_bachelor_plus"] / frame["pop25"],
    )
    .drop(columns=["pop25", "pop_25plus_highschool_plus", "pop_25plus_bachelor_plus"])
)

# Preview the first rows.
education_df.head()

Unnamed: 0,fips_code,county_name,state_name,pct_pop_25plus_highschool_plus,pct_pop_25plus_bachelor_plus
0,1001,Autauga County,Alabama,0.902789,0.282827
1,1003,Baldwin County,Alabama,0.917314,0.327976
2,1005,Barbour County,Alabama,0.778137,0.114647
3,1007,Bibb County,Alabama,0.803402,0.114682
4,1009,Blount County,Alabama,0.826962,0.15579


### Health Insurance

**Dataset Link:** https://data.census.gov/table/ACSST5Y2023.S2701?q=county&y=2023

**License:** Public Domain

**Dataset Name (CSV):** healthinsurance.csv

| Variable Name            | Explanation                                              | Import Code / Derivation | CSV  | Final Dataset |
|--------------------------|----------------------------------------------------------|--------------------------|------|--------------|
| pct_pop_health_insured  | Percentage of population (non-institutionalized) with health insurance. | S2701_C03_001E | [x]  | [x]          |

In [10]:
# Census variable code (ACS 5-year Subject Table S2701) mapped to the column
# name it receives in the saved CSV.
healthinsurance_dictionary = {
    "S2701_C03_001E": "pct_pop_health_insured",
}

# Year of the data to import.
healthinsurance_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
healthinsurance_output_file_name = f"healthinsurance_county_{healthinsurance_year}.csv"

In [11]:

# Import the requested variable into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
healthinsurance_output_file_path, healthinsurance_specific_variables = uscensus_importcsv(
    healthinsurance_dictionary,
    healthinsurance_year,
    healthinsurance_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    healthinsurance_output_file_path,
    healthinsurance_specific_variables,
)

Existing file found, removing...
Imported Variables:['pct_pop_health_insured']
Data saved to data_uscensus\healthinsurance_county_2023.csv
data_uscensus\healthinsurance_county_2023.csv has been modified and saved


In [12]:
# Load the saved CSV and convert the insured rate from percent to a fraction
# by dividing by 100.
healthinsurance_df = pd.read_csv(
    healthinsurance_output_file_path,
    dtype={"fips_code": str},
).assign(
    pct_pop_health_insured=lambda frame: frame["pct_pop_health_insured"] / 100,
)

# Preview the first rows.
healthinsurance_df.head()

Unnamed: 0,fips_code,county_name,state_name,pct_pop_health_insured
0,1001,Autauga County,Alabama,0.926
1,1003,Baldwin County,Alabama,0.918
2,1005,Barbour County,Alabama,0.892
3,1007,Bibb County,Alabama,0.917
4,1009,Blount County,Alabama,0.898


### Households

**Dataset Link:** https://data.census.gov/table/ACSDP5Y2023.DP02?q=county&y=2023

**License:** Public Domain

**Dataset Name (CSV):** households.csv

| Variable Name        | Explanation                                        | Import Code / Derivation | CSV  | Final Dataset |
|----------------------|----------------------------------------------------|--------------------------|------|--------------|
| households          | Total number of households.                        | DP02_0001E               | [x]  | [x]          |
| mean_household_size | Average number of people in a household (household size). | DP02_0016E               | [x]  | [x]          |


In [13]:
# Census variable codes (ACS 5-year Data Profile DP02) mapped to the column
# names they receive in the saved CSV.
households_dictionary = {
    "DP02_0001E": "households",
    "DP02_0016E": "mean_household_size",
}

# Year of the data to import.
households_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
households_output_file_name = f"households_county_{households_year}.csv"

In [14]:
# Import the requested variables into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
households_output_file_path, households_specific_variables = uscensus_importcsv(
    households_dictionary,
    households_year,
    households_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    households_output_file_path,
    households_specific_variables,
)

Existing file found, removing...
Imported Variables:['households', 'mean_household_size']
Data saved to data_uscensus\households_county_2023.csv
data_uscensus\households_county_2023.csv has been modified and saved


In [15]:
# Load the saved CSV; FIPS codes are read as strings rather than numbers.
households_df = pd.read_csv(
    households_output_file_path,
    dtype={"fips_code": str},
)

# Preview the first rows.
households_df.head()

Unnamed: 0,fips_code,county_name,state_name,households,mean_household_size
0,1001,Autauga County,Alabama,22523.0,2.61
1,1003,Baldwin County,Alabama,94642.0,2.5
2,1005,Barbour County,Alabama,9080.0,2.39
3,1007,Bibb County,Alabama,7571.0,2.74
4,1009,Blount County,Alabama,21977.0,2.67


### Housing Cost

**Dataset Link:** https://data.census.gov/table/ACSST5Y2023.S2503?q=county&y=2023

**License:** Public Domain

**Dataset Name (CSV):** housingcost.csv

| Variable Name                              | Explanation                                              | Import Code / Derivation | CSV  | Final Dataset |
|--------------------------------------------|----------------------------------------------------------|--------------------------|------|--------------|
| median_monthly_housing_cost               | Median monthly housing cost for all occupied housing units. | S2503_C01_024E          | [x]  | [x]          |
| median_monthly_housing_cost_owner_occupied | Median monthly housing cost for all owner-occupied housing units. | S2503_C03_024E          | [x]  | [x]          |
| median_monthly_housing_cost_renter_occupied | Median monthly housing cost for all renter-occupied housing units. | S2503_C05_024E          | [x]  | [x]          |

In [16]:
# Census variable codes (ACS 5-year Subject Table S2503) mapped to the column
# names they receive in the saved CSV.
housingcost_dictionary = {
    "S2503_C01_024E": "median_monthly_housing_cost",
    "S2503_C03_024E": "median_monthly_housing_cost_owner_occupied",
    "S2503_C05_024E": "median_monthly_housing_cost_renter_occupied",
}

# Year of the data to import.
housingcost_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
housingcost_output_file_name = f"housingcost_county_{housingcost_year}.csv"

In [17]:
# Import the requested variables into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
housingcost_output_file_path, housingcost_specific_variables = uscensus_importcsv(
    housingcost_dictionary,
    housingcost_year,
    housingcost_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    housingcost_output_file_path,
    housingcost_specific_variables,
)

Existing file found, removing...
Imported Variables:['median_monthly_housing_cost', 'median_monthly_housing_cost_owner_occupied', 'median_monthly_housing_cost_renter_occupied']
Data saved to data_uscensus\housingcost_county_2023.csv
data_uscensus\housingcost_county_2023.csv has been modified and saved


In [18]:
# Load the saved CSV; FIPS codes are read as strings rather than numbers.
housingcost_df = pd.read_csv(
    housingcost_output_file_path,
    dtype={"fips_code": str},
)

# Preview the first rows.
housingcost_df.head()

Unnamed: 0,fips_code,county_name,state_name,median_monthly_housing_cost,median_monthly_housing_cost_owner_occupied,median_monthly_housing_cost_renter_occupied
0,1001,Autauga County,Alabama,1048,953,1200
1,1003,Baldwin County,Alabama,1142,1098,1211
2,1005,Barbour County,Alabama,640,632,644
3,1007,Bibb County,Alabama,615,566,802
4,1009,Blount County,Alabama,741,746,743


### Housing Type

**Dataset Link:** https://data.census.gov/table/ACSST5Y2023.S1101?q=county&y=2023

**License:** Public Domain

**Dataset Name (CSV):** housingtype.csv

| Variable Name                     | Explanation                                        | Import Code / Derivation | CSV  | Final Dataset |
|------------------------------------|--------------------------------------------------|--------------------------|------|--------------|
| pct_households_owner_occupied     | Percentage of households that own the housing they live in.  | S1101_C01_018E | [x]  | [x]          |
| pct_households_renter_occupied    | Percentage of households that rent the housing they live in.  | S1101_C01_019E | [x]  | [x]          |


In [19]:
# Census variable codes (ACS 5-year Subject Table S1101) mapped to the column
# names they receive in the saved CSV.
housingtype_dictionary = {
    "S1101_C01_018E": "pct_households_owner_occupied",
    "S1101_C01_019E": "pct_households_renter_occupied",
}

# Year of the data to import.
housingtype_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
housingtype_output_file_name = f"housingtype_county_{housingtype_year}.csv"

In [20]:
# Import the requested variables into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
housingtype_output_file_path, housingtype_specific_variables = uscensus_importcsv(
    housingtype_dictionary,
    housingtype_year,
    housingtype_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    housingtype_output_file_path,
    housingtype_specific_variables,
)

Existing file found, removing...
Imported Variables:['pct_households_owner_occupied', 'pct_households_renter_occupied']
Data saved to data_uscensus\housingtype_county_2023.csv
data_uscensus\housingtype_county_2023.csv has been modified and saved


In [21]:
# Load the saved CSV and convert both tenure rates from percent to fractions
# by dividing by 100.
housingtype_df = pd.read_csv(
    housingtype_output_file_path,
    dtype={"fips_code": str},
).assign(
    pct_households_owner_occupied=lambda frame: frame["pct_households_owner_occupied"] / 100,
    pct_households_renter_occupied=lambda frame: frame["pct_households_renter_occupied"] / 100,
)

# Preview the first rows.
housingtype_df.head()

Unnamed: 0,fips_code,county_name,state_name,pct_households_owner_occupied,pct_households_renter_occupied
0,1001,Autauga County,Alabama,0.749,0.251
1,1003,Baldwin County,Alabama,0.775,0.225
2,1005,Barbour County,Alabama,0.675,0.325
3,1007,Bibb County,Alabama,0.772,0.228
4,1009,Blount County,Alabama,0.795,0.205


### Industry Composition

**Dataset Link:** https://data.census.gov/table/ACSST5Y2023.S2405?q=county&y=2023

**License:** Public Domain

**Dataset Name (CSV):** industrycomposition.csv

| Variable Name                                               | Explanation                                                                 | Import Code / Derivation                     | CSV  | Final Dataset |
|------------------------------------------------------------|-----------------------------------------------------------------------------|----------------------------------------------|------|--------------|
| employed                                                  | Employed population (aged 16 years or older).                              | S2405_C01_001E                              | [x]  | [ ]          |
| employed_agriculture_forestry_fishing_hunting_mining     | Employed population (aged 16 years or older) in agriculture, forestry, fishing, hunting, or mining. | S2405_C01_002E  | [x]  | [ ]          |
| employed_construction                                     | Employed population (aged 16 years or older) working in construction.      | S2405_C01_003E                              | [x]  | [ ]          |
| employed_manufacturing                                    | Employed population (aged 16 years or older) working in manufacturing.     | S2405_C01_004E                              | [x]  | [ ]          |
| employed_wholesale_trade                                  | Employed population (aged 16 years or older) working in wholesale trade.   | S2405_C01_005E                              | [x]  | [ ]          |
| employed_retail_trade                                     | Employed population (aged 16 years or older) working in retail trade.      | S2405_C01_006E                              | [x]  | [ ]          |
| employed_transportation_warehousing_utilities            | Employed population (aged 16 years or older) in transportation, warehousing, or utilities. | S2405_C01_007E | [x]  | [ ]          |
| employed_information                                     | Employed population (aged 16 years or older) working in the information industry. | S2405_C01_008E | [x]  | [ ]          |
| employed_finance_insurance_real_estate                   | Employed population (aged 16 years or older) in finance, insurance, or real estate. | S2405_C01_009E | [x]  | [ ]          |
| employed_professional_scientific_management_administration | Employed population (aged 16 years or older) in professional, scientific, management, or administrative services. | S2405_C01_010E | [x]  | [ ]          |
| employed_education_healthcare_social_assistance          | Employed population (aged 16 years or older) in educational services, health care, or social assistance. | S2405_C01_011E | [x]  | [ ]          |
| employed_arts_entertainment_recreation_food_accommodation | Employed population (aged 16 years or older) in arts, entertainment, recreation, accommodation, or food services. | S2405_C01_012E | [x]  | [ ]          |
| employed_other_services                                  | Employed population (aged 16 years or older) in other services (excluding public administration). | S2405_C01_013E | [x]  | [ ]          |
| employed_public_administration                           | Employed population (aged 16 years or older) in public administration.     | S2405_C01_014E                              | [x]  | [ ]          |
| pct_employed_agriculture_forestry_fishing_hunting_mining | Percentage of employed population (aged 16 years or older) in agriculture, forestry, fishing, hunting, or mining. | = employed_agriculture_forestry_fishing_hunting_mining / employed | [ ]  | [x]          |
| ...                                                      | ...                                                                         | ...                                          | [ ]  | [ ]          |



In [22]:
# Census variable codes (ACS 5-year Subject Table S2405) mapped to the column
# names they receive in the saved CSV. The first entry is the total employed
# population; the remaining entries are the per-industry counts.
industrycomposition_dictionary = {
    "S2405_C01_001E": "employed",
    "S2405_C01_002E": "employed_agriculture_forestry_fishing_hunting_mining",
    "S2405_C01_003E": "employed_construction",
    "S2405_C01_004E": "employed_manufacturing",
    "S2405_C01_005E": "employed_wholesale_trade",
    "S2405_C01_006E": "employed_retail_trade",
    "S2405_C01_007E": "employed_transportation_warehousing_utilities",
    "S2405_C01_008E": "employed_information",
    "S2405_C01_009E": "employed_finance_insurance_real_estate",
    "S2405_C01_010E": "employed_professional_scientific_management_administration",
    "S2405_C01_011E": "employed_education_healthcare_social_assistance",
    "S2405_C01_012E": "employed_arts_entertainment_recreation_food_accommodation",
    "S2405_C01_013E": "employed_other_services",
    "S2405_C01_014E": "employed_public_administration",
}

# Year of the data to import.
industrycomposition_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
industrycomposition_output_file_name = f"industrycomposition_county_{industrycomposition_year}.csv"

In [23]:
# Import the requested variables into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
industrycomposition_output_file_path, industrycomposition_specific_variables = uscensus_importcsv(
    industrycomposition_dictionary,
    industrycomposition_year,
    industrycomposition_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    industrycomposition_output_file_path,
    industrycomposition_specific_variables,
)

Existing file found, removing...
Imported Variables:['employed', 'employed_agriculture_forestry_fishing_hunting_mining', 'employed_construction', 'employed_manufacturing', 'employed_wholesale_trade', 'employed_retail_trade', 'employed_transportation_warehousing_utilities', 'employed_information', 'employed_finance_insurance_real_estate', 'employed_professional_scientific_management_administration', 'employed_education_healthcare_social_assistance', 'employed_arts_entertainment_recreation_food_accommodation', 'employed_other_services', 'employed_public_administration']
Data saved to data_uscensus\industrycomposition_county_2023.csv
data_uscensus\industrycomposition_county_2023.csv has been modified and saved


In [24]:
# Load the saved CSV; FIPS codes are read as strings rather than numbers.
industrycomposition_df = pd.read_csv(industrycomposition_output_file_path, dtype={"fips_code": str})

# Per-industry employment count columns. Each one is turned into a share of the
# total "employed" column below. Listing them once keeps the derived columns
# and the dropped columns in sync.
industrycomposition_sector_columns = [
    "employed_agriculture_forestry_fishing_hunting_mining",
    "employed_construction",
    "employed_manufacturing",
    "employed_wholesale_trade",
    "employed_retail_trade",
    "employed_transportation_warehousing_utilities",
    "employed_information",
    "employed_finance_insurance_real_estate",
    "employed_professional_scientific_management_administration",
    "employed_education_healthcare_social_assistance",
    "employed_arts_entertainment_recreation_food_accommodation",
    "employed_other_services",
    "employed_public_administration",
]

# Add one "pct_<sector>" column per industry: sector count / total employed.
for industrycomposition_sector_column in industrycomposition_sector_columns:
    industrycomposition_df[f"pct_{industrycomposition_sector_column}"] = (
        industrycomposition_df[industrycomposition_sector_column] / industrycomposition_df["employed"]
    )

# Only the shares are kept; drop the total and the raw per-industry counts.
industrycomposition_df = industrycomposition_df.drop(
    columns=["employed", *industrycomposition_sector_columns]
)

# Preview the first rows.
industrycomposition_df.head()

Unnamed: 0,fips_code,county_name,state_name,pct_employed_agriculture_forestry_fishing_hunting_mining,pct_employed_construction,pct_employed_manufacturing,pct_employed_wholesale_trade,pct_employed_retail_trade,pct_employed_transportation_warehousing_utilities,pct_employed_information,pct_employed_finance_insurance_real_estate,pct_employed_professional_scientific_management_administration,pct_employed_education_healthcare_social_assistance,pct_employed_arts_entertainment_recreation_food_accommodation,pct_employed_other_services,pct_employed_public_administration
0,1001,Autauga County,Alabama,0.009855,0.07236,0.116974,0.029262,0.121901,0.056667,0.012925,0.057994,0.09063,0.189826,0.101395,0.036881,0.103328
1,1003,Baldwin County,Alabama,0.017352,0.08194,0.098151,0.019807,0.146172,0.0495,0.008872,0.071352,0.118277,0.192532,0.101072,0.049418,0.045557
2,1005,Barbour County,Alabama,0.041608,0.052945,0.227209,0.005844,0.099696,0.068373,0.005376,0.035414,0.052127,0.213768,0.086138,0.029687,0.081814
3,1007,Bibb County,Alabama,0.026662,0.101552,0.163408,0.029032,0.134969,0.057353,0.008413,0.058656,0.083185,0.200735,0.034246,0.054746,0.047043
4,1009,Blount County,Alabama,0.022232,0.09342,0.163515,0.032976,0.132258,0.07744,0.009416,0.047902,0.078026,0.195515,0.059115,0.052669,0.035516


### Internet Access

**Dataset Link:** https://data.census.gov/table/ACSDT5Y2023.B28011?q=county

**License:** Public Domain

**Dataset Name (CSV):** internetaccess.csv

| Variable Name                 | Explanation                                         | Import Code / Derivation       | CSV  | Final Dataset  |
|--------------------------------|-----------------------------------------------------|--------------------------------|------|--------------|
| households                     | Total number of households.                        | B28011_001E                   | [x]  | [ ]          |
| households_with_internet        | Number of households with an internet subscription. | B28011_002E                   | [x]  | [ ]          |
| pct_households_with_internet    | Percentage of households with an internet subscription. | = households_with_internet / households | [ ]  | [x]          |


In [25]:
# Census variable codes (ACS 5-year Detailed Table B28011) mapped to the column
# names they receive in the saved CSV.
internetaccess_dictionary = {
    "B28011_001E": "households",
    "B28011_002E": "households_with_internet",
}

# Year of the data to import.
internetaccess_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
internetaccess_output_file_name = f"internetaccess_county_{internetaccess_year}.csv"

In [26]:
# Import the requested variables into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
internetaccess_output_file_path, internetaccess_specific_variables = uscensus_importcsv(
    internetaccess_dictionary,
    internetaccess_year,
    internetaccess_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    internetaccess_output_file_path,
    internetaccess_specific_variables,
)

Existing file found, removing...
Imported Variables:['households', 'households_with_internet']
Data saved to data_uscensus\internetaccess_county_2023.csv
data_uscensus\internetaccess_county_2023.csv has been modified and saved


In [27]:
# Load the counts, derive the share of households with an internet
# subscription, then discard the raw counts so only the share remains.
internetaccess_df = (
    pd.read_csv(internetaccess_output_file_path, dtype={"fips_code": str})
    .assign(
        pct_households_with_internet=lambda frame: frame["households_with_internet"] / frame["households"],
    )
    .drop(columns=["households", "households_with_internet"])
)

# Preview the first rows.
internetaccess_df.head()

Unnamed: 0,fips_code,county_name,state_name,pct_households_with_internet
0,1001,Autauga County,Alabama,0.90947
1,1003,Baldwin County,Alabama,0.898988
2,1005,Barbour County,Alabama,0.723018
3,1007,Bibb County,Alabama,0.813631
4,1009,Blount County,Alabama,0.84411


### Labor Force Participation Rate

**Dataset Link:** https://data.census.gov/table/ACSST5Y2023.S2301?q=county&y=2023

**License:** Public Domain

**Dataset Name (CSV):** laborforceparticipation.csv

| Variable Name                      | Explanation                                                       | Import Code / Derivation | CSV  | Final Dataset |
|-------------------------------------|-------------------------------------------------------------------|--------------------------|------|--------------|
| labor_force_participation_rate      | Labor force participation rate (=total labor force / total working-age population). | S2301_C02_001E | [x]  | [x]          |


In [28]:
# Census variable code (ACS 5-year Subject Table S2301) mapped to the column
# name it receives in the saved CSV.
laborforceparticipation_dictionary = {
    "S2301_C02_001E": "labor_force_participation_rate",
}

# Year of the data to import.
laborforceparticipation_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
laborforceparticipation_output_file_name = f"laborforceparticipation_county_{laborforceparticipation_year}.csv"

In [29]:
# Import the requested variable into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
laborforceparticipation_output_file_path, laborforceparticipation_specific_variables = uscensus_importcsv(
    laborforceparticipation_dictionary,
    laborforceparticipation_year,
    laborforceparticipation_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    laborforceparticipation_output_file_path,
    laborforceparticipation_specific_variables,
)

Existing file found, removing...
Imported Variables:['labor_force_participation_rate']
Data saved to data_uscensus\laborforceparticipation_county_2023.csv
data_uscensus\laborforceparticipation_county_2023.csv has been modified and saved


In [30]:
# Load the saved CSV and convert the participation rate from percent to a
# fraction by dividing by 100.
laborforceparticipation_df = pd.read_csv(
    laborforceparticipation_output_file_path,
    dtype={"fips_code": str},
).assign(
    labor_force_participation_rate=lambda frame: frame["labor_force_participation_rate"] / 100,
)

# Preview the first rows.
laborforceparticipation_df.head()

Unnamed: 0,fips_code,county_name,state_name,labor_force_participation_rate
0,1001,Autauga County,Alabama,0.59
1,1003,Baldwin County,Alabama,0.583
2,1005,Barbour County,Alabama,0.449
3,1007,Bibb County,Alabama,0.516
4,1009,Blount County,Alabama,0.574


### Limited English Speaking

**Dataset Link:** https://data.census.gov/table/ACSST5Y2023.S1602?q=county&y=2023

**License:** Public Domain

**Dataset Name (CSV):** limitedenglish.csv

| Variable Name                            | Explanation                                                                                              | Import Code / Derivation | CSV  | Final Dataset |
|------------------------------------------|----------------------------------------------------------------------------------------------------------|--------------------------|------|--------------|
| pct_households_limited_english_proficiency | Percentage of households that are limited English-speaking, i.e. households in which no member aged 14 years or older speaks only English or speaks English "very well." | S1602_C04_001E | [x]  | [x]          |


In [31]:
# Census variable code (ACS 5-year Subject Table S1602) mapped to the column
# name it receives in the saved CSV.
limitedenglish_dictionary = {
    "S1602_C04_001E": "pct_households_limited_english_proficiency",
}

# Year of the data to import.
limitedenglish_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
limitedenglish_output_file_name = f"limitedenglish_county_{limitedenglish_year}.csv"

In [32]:

# Import the requested variable into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
limitedenglish_output_file_path, limitedenglish_specific_variables = uscensus_importcsv(
    limitedenglish_dictionary,
    limitedenglish_year,
    limitedenglish_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    limitedenglish_output_file_path,
    limitedenglish_specific_variables,
)

Existing file found, removing...
Imported Variables:['pct_households_limited_english_proficiency']
Data saved to data_uscensus\limitedenglish_county_2023.csv
data_uscensus\limitedenglish_county_2023.csv has been modified and saved


In [33]:
# Load the saved CSV and convert the limited-English rate from percent to a
# fraction by dividing by 100.
limitedenglish_df = pd.read_csv(
    limitedenglish_output_file_path,
    dtype={"fips_code": str},
).assign(
    pct_households_limited_english_proficiency=lambda frame: frame["pct_households_limited_english_proficiency"] / 100,
)

# Preview the first rows.
limitedenglish_df.head()

Unnamed: 0,fips_code,county_name,state_name,pct_households_limited_english_proficiency
0,1001,Autauga County,Alabama,0.005
1,1003,Baldwin County,Alabama,0.008
2,1005,Barbour County,Alabama,0.025
3,1007,Bibb County,Alabama,0.007
4,1009,Blount County,Alabama,0.018


### Mean Household Income

**Dataset Link:** https://data.census.gov/table/ACSST5Y2023.S1901?q=county&y=2023

**License:** Public Domain

**Dataset Name (CSV):** meanincome.csv

| Variable Name           | Explanation                                                                 | Import Code / Derivation | CSV  | Final Dataset |
|-------------------------|---------------------------------------------------------------------------|--------------------------|------|--------------|
| mean_household_income  | Mean household income in the past 12 months (in 2023 Inflation-Adjusted Dollars). | S1901_C01_013E | [x]  | [x]          |


In [34]:
# Census variable code (ACS 5-year Subject Table S1901) mapped to the column
# name it receives in the saved CSV.
meanincome_dictionary = {
    "S1901_C01_013E": "mean_household_income",
}

# Year of the data to import.
meanincome_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
meanincome_output_file_name = f"meanincome_county_{meanincome_year}.csv"

In [35]:
# Import the requested variable into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
meanincome_output_file_path, meanincome_specific_variables = uscensus_importcsv(
    meanincome_dictionary,
    meanincome_year,
    meanincome_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    meanincome_output_file_path,
    meanincome_specific_variables,
)

Existing file found, removing...
Imported Variables:['mean_household_income']
Data saved to data_uscensus\meanincome_county_2023.csv
data_uscensus\meanincome_county_2023.csv has been modified and saved


In [36]:
# Load the saved CSV; FIPS codes are read as strings rather than numbers.
meanincome_df = pd.read_csv(
    meanincome_output_file_path,
    dtype={"fips_code": str},
)

# Preview the first rows.
meanincome_df.head()

Unnamed: 0,fips_code,county_name,state_name,mean_household_income
0,1001,Autauga County,Alabama,93367
1,1003,Baldwin County,Alabama,100105
2,1005,Barbour County,Alabama,64745
3,1007,Bibb County,Alabama,67735
4,1009,Blount County,Alabama,79203


### Median Household Income

**Dataset Link:** https://data.census.gov/table/ACSST5Y2023.S1903?q=county&y=2023

**License:** Public Domain

**Dataset Name (CSV):** medianincome.csv

| Variable Name            | Explanation                                                                 | Import Code / Derivation | CSV  | Final Dataset |
|--------------------------|---------------------------------------------------------------------------|--------------------------|------|--------------|
| median_household_income  | Median household income in the past 12 months (in 2023 Inflation-Adjusted Dollars). | S1903_C03_001E | [x]  | [x]          |


In [37]:
# Census variable code (ACS 5-year Subject Table S1903) mapped to the column
# name it receives in the saved CSV.
medianincome_dictionary = {
    "S1903_C03_001E": "median_household_income",
}

# Year of the data to import.
medianincome_year = "2023"

# Built from the year above so the file name cannot drift from the data year.
medianincome_output_file_name = f"medianincome_county_{medianincome_year}.csv"

In [38]:
# Import the requested variable into a CSV. The helper hands back the path of
# the saved file and (presumably) the list of imported column names.
medianincome_output_file_path, medianincome_specific_variables = uscensus_importcsv(
    medianincome_dictionary,
    medianincome_year,
    medianincome_output_file_name,
)

# Post-process the saved CSV (the helper logs that the file was modified and saved).
uscensus_modify(
    medianincome_output_file_path,
    medianincome_specific_variables,
)

Existing file found, removing...
Imported Variables:['median_household_income']
Data saved to data_uscensus\medianincome_county_2023.csv
data_uscensus\medianincome_county_2023.csv has been modified and saved


In [39]:
# Load the saved CSV; FIPS codes are read as strings rather than numbers.
medianincome_df = pd.read_csv(
    medianincome_output_file_path,
    dtype={"fips_code": str},
)

# Preview the first rows.
medianincome_df.head()

Unnamed: 0,fips_code,county_name,state_name,median_household_income
0,1001,Autauga County,Alabama,69841
1,1003,Baldwin County,Alabama,75019
2,1005,Barbour County,Alabama,44290
3,1007,Bibb County,Alabama,51215
4,1009,Blount County,Alabama,61096


## Other Sources

### Helper Functions

In [40]:
from othersources_functions import othersources_transformfips

from othersources_functions import othersources_mapdf

In [41]:
# Output folder: the modified csv files are saved here
output_folder = "./data_othersources"

# Input folder: the original csv files are read from this subfolder of the
# output folder
input_folder = f"{output_folder}/originaldatasets"

```python

# Specify the file name (input / output)
name_file_name = "name_county_2023.csv"

# Read df from file
name_df = pd.read_csv(f"{input_folder}/{name_file_name}")

# Specify the desired variables and rename them
name_column_dictionary = {
    # "DatasetVariable": "Desired Name" 
    "...": "...",
}

# Transform the fips code column of the dataframe
name_df = othersources_transformfips(name_df, "fips_code column")

# Map the df to the county_df and keep only specified variables
name_df = othersources_mapdf(name_df, name_column_dictionary, county_df)

# Save df as a file 
name_df.to_csv(f"{output_folder}/{name_file_name}", index=False)
```


### County Template

Create a county template from an existing US Census dataset. The datasets from other sources are mapped onto this template.

**File Name:** county_template.csv

In [42]:
# Build the county template from the demographics dataset, keeping only the
# three identifying columns. fips_code is read as strings rather than numbers.
county_df = pd.read_csv(
    demographics_output_file_path,
    dtype={"fips_code": str},
)[["fips_code", "county_name", "state_name"]]

# Save the template into the same output folder that the other cells of this
# section write to, instead of repeating the folder path as a literal.
county_df.to_csv(f"{output_folder}/county_template.csv", index=False)

### Geo Data

In [43]:
geo_file_name = "geo_county_2023.csv"

# The original file is semicolon-separated; the "Geo Shape" column is dropped
# right after reading because it is not used below.
geo_df = pd.read_csv(f"{input_folder}/{geo_file_name}", sep=";").drop(columns=["Geo Shape"])

# "Geo Point" is split at ", " into two parts, which are stored as float
# columns "lat" and "lng".
geo_df[["lat", "lng"]] = (
    geo_df["Geo Point"].str.split(", ", expand=True).astype(float)
)

# Dataset column -> desired name
geo_column_dictionary = {"lat": "latitude", "lng": "longitude"}

# Transform the fips code column, then map the df to the county template
geo_df = othersources_transformfips(geo_df, "Official Code County")
geo_df = othersources_mapdf(geo_df, geo_column_dictionary, county_df)

geo_df.to_csv(f"{output_folder}/{geo_file_name}", index=False)

All FIPS Codes are now length 5.
Number of missing values in each column:
fips_code      0
county_name    0
state_name     0
latitude       0
longitude      0
dtype: int64


### Population

In [44]:
population_file_name = "population_county_2023.csv"

# Read the original file and pivot it: one row per "FIPStxt", one column per
# "Attribute", filled with the entries of "Value". reset_index turns "FIPStxt"
# back into a regular column.
population_df = (
    pd.read_csv(f"{input_folder}/{population_file_name}", encoding="latin1")
    .pivot(index="FIPStxt", columns="Attribute", values="Value")
    .reset_index()
)

# Dataset column -> desired name
population_column_dictionary = {
    "POP_ESTIMATE_2023": "population",
    "R_BIRTH_2023": "birth_rate",
    "R_DEATH_2023": "death_rate",
}

# Transform the fips code column, then map the df to the county template
population_df = othersources_transformfips(population_df, "FIPStxt")
population_df = othersources_mapdf(population_df, population_column_dictionary, county_df)

population_df.to_csv(f"{output_folder}/{population_file_name}", index=False)

All FIPS Codes are now length 5.
Number of missing values in each column:
fips_code      0
county_name    0
state_name     0
population     0
birth_rate     0
death_rate     0
dtype: int64


### Poverty

In [45]:
poverty_file_name = "poverty_county_2023.csv"

poverty_df = pd.read_csv(f"{input_folder}/{poverty_file_name}", encoding="latin1")

# Keep only the rows of the "PCTPOVALL_2023" attribute and renumber the index.
poverty_df = (
    poverty_df
    .loc[poverty_df["Attribute"] == "PCTPOVALL_2023"]
    .reset_index(drop=True)
)

# Dataset column -> desired name
poverty_column_dictionary = {"Value": "poverty_rate"}

# Transform the fips code column, then map the df to the county template
poverty_df = othersources_transformfips(poverty_df, "FIPS_Code")
poverty_df = othersources_mapdf(poverty_df, poverty_column_dictionary, county_df)

# Scale the rate by 1/100 (presumably the source stores it as a percentage).
poverty_df["poverty_rate"] = poverty_df["poverty_rate"] / 100

poverty_df.to_csv(f"{output_folder}/{poverty_file_name}", index=False)

All FIPS Codes are now length 5.
Some counties were missing in the dataframe, 1 NaN values were introduced.
Missing FIPS Codes:
['15005']
Number of missing values in each column:
fips_code       0
county_name     0
state_name      0
poverty_rate    1
dtype: int64


### Unemployment

In [46]:
unemployment_file_name = "unemployment_county_2023.csv"

unemployment_df = pd.read_csv(f"{input_folder}/{unemployment_file_name}", encoding="latin1")

# Keep only the rows of the "Unemployment_rate_2023" attribute and renumber
# the index.
unemployment_df = (
    unemployment_df
    .loc[unemployment_df["Attribute"] == "Unemployment_rate_2023"]
    .reset_index(drop=True)
)

# Dataset column -> desired name
unemployment_column_dictionary = {"Value": "unemployment_rate"}

# Transform the fips code column, then map the df to the county template
unemployment_df = othersources_transformfips(unemployment_df, "FIPS_Code")
unemployment_df = othersources_mapdf(unemployment_df, unemployment_column_dictionary, county_df)

# Scale the rate by 1/100 (presumably the source stores it as a percentage).
unemployment_df["unemployment_rate"] = unemployment_df["unemployment_rate"] / 100

unemployment_df.to_csv(f"{output_folder}/{unemployment_file_name}", index=False)

All FIPS Codes are now length 5.
Some counties were missing in the dataframe, 10 NaN values were introduced.
Missing FIPS Codes:
['09110', '09120', '09130', '09140', '09150', '09160', '09170', '09180', '09190', '15005']
Number of missing values in each column:
fips_code             0
county_name           0
state_name            0
unemployment_rate    10
dtype: int64


### Rural Urban

In [47]:
ruralurban_file_name = "ruralurban_county_2023.csv"

ruralurban_df = pd.read_csv(f"{input_folder}/{ruralurban_file_name}", encoding="latin1")

# Keep only the rows of the "RUCC_2023" attribute and renumber the index.
ruralurban_df = (
    ruralurban_df
    .loc[ruralurban_df["Attribute"] == "RUCC_2023"]
    .reset_index(drop=True)
)

# Dataset column -> desired name
ruralurban_column_dictionary = {"Value": "rucc"}

# Transform the fips code column, then map the df to the county template
ruralurban_df = othersources_transformfips(ruralurban_df, "FIPS")
ruralurban_df = othersources_mapdf(ruralurban_df, ruralurban_column_dictionary, county_df)

# RUCC Codes Classification
# https://www.ers.usda.gov/data-products/rural-urban-continuum-codes/documentation
# Codes 1-3 are labelled "Metro", codes 4-9 are labelled "Nonmetro".
ruralurban_mapping = {
    code: ("Metro" if code <= 3 else "Nonmetro") for code in range(1, 10)
}

# The codes must be integers so that they match the keys of the mapping.
ruralurban_df["rucc"] = ruralurban_df["rucc"].astype(int)
ruralurban_df["area_classification"] = ruralurban_df["rucc"].map(ruralurban_mapping)

ruralurban_df.to_csv(f"{output_folder}/{ruralurban_file_name}", index=False)


All FIPS Codes are now length 5.
Number of missing values in each column:
fips_code      0
county_name    0
state_name     0
rucc           0
dtype: int64


### Zillow House Value Index (ZHVI)

In [48]:
zillowhousevalue_file_name = "zillowhousevalue_county_2023.csv"

zillowhousevalue_df = pd.read_csv(f"{input_folder}/{zillowhousevalue_file_name}")

# Assemble the fips code from its two parts: the state code left-padded with
# zeros to 2 characters followed by the municipal code left-padded to 3.
zillowhousevalue_df["fips_code"] = (
    zillowhousevalue_df["StateCodeFIPS"].astype(str).str.zfill(2)
    + zillowhousevalue_df["MunicipalCodeFIPS"].astype(str).str.zfill(3)
)

# Month-end columns of 2023 that go into the yearly mean
zillowhousevalue_months = [
    "2023-01-31", "2023-02-28", "2023-03-31",
    "2023-04-30", "2023-05-31", "2023-06-30",
    "2023-07-31", "2023-08-31", "2023-09-30",
    "2023-10-31", "2023-11-30", "2023-12-31",
]

# Row-wise mean over the twelve monthly columns.
# NOTE(review): the recorded output reports 80 missing counties but 82 missing
# values - presumably two counties in the file have no 2023 values; verify.
zillowhousevalue_df["mean_zhvi_home_value"] = (
    zillowhousevalue_df[zillowhousevalue_months].mean(axis="columns")
)

# Dataset column -> desired name (the computed column keeps its name)
zillowhousevalue_column_dictionary = {"mean_zhvi_home_value": "mean_zhvi_home_value"}

# Transform the fips code column, then map the df to the county template
zillowhousevalue_df = othersources_transformfips(zillowhousevalue_df, "fips_code")
zillowhousevalue_df = othersources_mapdf(zillowhousevalue_df, zillowhousevalue_column_dictionary, county_df)

zillowhousevalue_df.to_csv(f"{output_folder}/{zillowhousevalue_file_name}", index=False)


All FIPS Codes are now length 5.
Some counties were missing in the dataframe, 80 NaN values were introduced.
Missing FIPS Codes:
['02013', '02016', '02050', '02060', '02063', '02066', '02068', '02070', '02105', '02158', '02164', '02180', '02188', '02198', '02230', '02282', '02290', '05099', '08009', '08057', '09110', '09120', '09130', '09140', '09150', '09160', '09170', '09180', '09190', '15005', '16023', '28055', '28119', '29087', '29227', '31005', '31009', '31075', '32009', '35019', '35021', '35033', '38007', '38023', '38037', '38047', '38051', '38069', '38079', '38085', '38087', '38091', '46003', '46017', '46041', '46043', '46049', '46053', '46055', '46061', '46069', '46075', '46085', '46089', '46095', '46102', '46121', '46137', '48033', '48137', '48229', '48261', '48263', '48269', '48301', '48311', '48359', '48393', '48413', '49031']
Number of missing values in each column:
fips_code                0
county_name              0
state_name               0
mean_zhvi_home_value    82
d

# Combine Datasets

In [49]:
from functools import reduce

In [52]:
# All DataFrames that go into the final dataset. Every frame is merged on the
# key columns fips_code, county_name and state_name.
county_dfs = [
    geo_df,
    population_df,
    poverty_df,
    ruralurban_df,
    unemployment_df,
    zillowhousevalue_df,
    demographics_df,
    education_df,
    healthinsurance_df,
    households_df,
    housingcost_df,
    housingtype_df,
    industrycomposition_df,
    internetaccess_df,
    laborforceparticipation_df,
    limitedenglish_df,
    meanincome_df,
    medianincome_df
]

# Merge all DataFrames
# The frames are merged pairwise from left to right; the inner join keeps only
# rows whose key columns appear in every frame.
# NOTE(review): this rebinds county_df, which the "Other Sources" cells use as
# the county template - re-run the County Template cell before re-running any
# of them after this cell.
county_df = reduce(
    lambda left, right: pd.merge(
        left,
        right,
        on=["fips_code", "county_name", "state_name"],
        how="inner",
    ),
    county_dfs,
)

# to_csv does not create missing folders, so make sure the target folder
# exists before writing (os is imported in the first cell of the notebook).
os.makedirs("./data", exist_ok=True)

county_df.to_csv("./data/merged_county_data.csv", index=False)