Explore population data for integration

In [1]:
import pandas as pd

# Fetch the Census county population workbook untouched so its raw layout
# (title rows, multi-row header, footer notes) can be inspected first.
census_url = "https://www2.census.gov/programs-surveys/popest/tables/2020-2024/counties/totals/co-est2024-pop.xlsx"
df = pd.read_excel(census_url)
df.head()

Unnamed: 0,table with row headers in column A and column headers in rows 3 through 4 (leading dots indicate sub-parts),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Annual Estimates of the Resident Population fo...,,,,,,
1,Geographic Area,"April 1, 2020 Estimates Base",Population Estimate (as of July 1),,,,
2,,,2020,2021.0,2022.0,2023.0,2024.0
3,United States,331515736,331577720,332099760.0,334017321.0,336806231.0,340110988.0
4,".Autauga County, Alabama",58800,58909,59191.0,59736.0,60436.0,61464.0


In [2]:
# Re-read the workbook keeping only the area name (column A) and the
# July 1, 2024 estimate (column G), under friendlier column names.
# NOTE: with skiprows=4 the "United States" total row appears to be the first
# row read; it is consumed as the header row and replaced by `names`, so the
# data starts at the first county.
census_url = "https://www2.census.gov/programs-surveys/popest/tables/2020-2024/counties/totals/co-est2024-pop.xlsx"
df = pd.read_excel(
    census_url,
    usecols="A,G",
    skiprows=4,
    names=["Region", "Pop_Est_July_1_2024"],
)
df.head()

Unnamed: 0,Region,Pop_Est_July_1_2024
0,".Autauga County, Alabama",61464.0
1,".Baldwin County, Alabama",261608.0
2,".Barbour County, Alabama",24358.0
3,".Bibb County, Alabama",22258.0
4,".Blount County, Alabama",60163.0


In [3]:
# Remove only the leading dot(s) the Census sheet uses to mark sub-parts.
# Deleting every "." would also mangle names that legitimately contain a
# period (e.g. a "St. ..." county), which breaks later joins on county name.
# The vectorized .str accessor also leaves missing values as NaN instead of
# turning them into the literal string "nan".
df['Region'] = df['Region'].str.lstrip('.')
df.head()

Unnamed: 0,Region,Pop_Est_July_1_2024
0,"Autauga County, Alabama",61464.0
1,"Baldwin County, Alabama",261608.0
2,"Barbour County, Alabama",24358.0
3,"Bibb County, Alabama",22258.0
4,"Blount County, Alabama",60163.0


In [4]:
# Preview the frame with its final 10 rows left out.
df.iloc[:-10]

Unnamed: 0,Region,Pop_Est_July_1_2024
0,"Autauga County, Alabama",61464.0
1,"Baldwin County, Alabama",261608.0
2,"Barbour County, Alabama",24358.0
3,"Bibb County, Alabama",22258.0
4,"Blount County, Alabama",60163.0
...,...,...
3135,"Park County, Wyoming",31082.0
3136,"Platte County, Wyoming",8512.0
3137,"Sheridan County, Wyoming",32978.0
3138,"Sublette County, Wyoming",8965.0


In [5]:
# Split "Region" at the first comma into "County" and "State".
# n=1 limits the split to one cut, so any later commas stay in "State".
region_parts = df["Region"].str.split(",", n=1, expand=True)
df[["County", "State"]] = region_parts

# The combined column is redundant once it has been split.
del df["Region"]
df.head(-10)

Unnamed: 0,Pop_Est_July_1_2024,County,State
0,61464.0,Autauga County,Alabama
1,261608.0,Baldwin County,Alabama
2,24358.0,Barbour County,Alabama
3,22258.0,Bibb County,Alabama
4,60163.0,Blount County,Alabama
...,...,...,...
3135,31082.0,Park County,Wyoming
3136,8512.0,Platte County,Wyoming
3137,32978.0,Sheridan County,Wyoming
3138,8965.0,Sublette County,Wyoming


In [6]:
# Lookup table: full state / territory name -> two-letter abbreviation.
us_state_to_abbrev = {
    # The 50 states, alphabetical
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
    "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY",
    # Federal district
    "District of Columbia": "DC",
    # Territories and outlying areas
    "American Samoa": "AS", "Guam": "GU", "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR", "United States Minor Outlying Islands": "UM",
    "Virgin Islands, U.S.": "VI",
}
# Trim the whitespace left behind by the comma split, then swap full names
# for their abbreviations. Values with no entry in the lookup (such as the
# footer text) pass through unchanged.
df['State'] = df['State'].str.strip().replace(us_state_to_abbrev)
df


Unnamed: 0,Pop_Est_July_1_2024,County,State
0,61464.0,Autauga County,AL
1,261608.0,Baldwin County,AL
2,24358.0,Barbour County,AL
3,22258.0,Bibb County,AL
4,60163.0,Blount County,AL
...,...,...,...
3145,,Note: The estimates are developed from a base ...,"Vintage 2020 estimates, and 2020 Demographic A..."
3146,,Suggested Citation:,
3147,,Annual Estimates of the Resident Population fo...,"2020 to July 1, 2024 (CO-EST2024-POP)"
3148,,Source: US Census Bureau,Population Division


In [7]:
# Discard every row that has a missing value in any column; this removes the
# workbook's footer notes, which carry no population figure.
df.dropna(axis=0, how="any", inplace=True)

In [8]:
# Sanity check: list any rows that still lack a population estimate
# (an empty result means the cleanup above worked).
missing_pop = df['Pop_Est_July_1_2024'].isna()
none = df[missing_pop]
none


Unnamed: 0,Pop_Est_July_1_2024,County,State


In [9]:
# Populations are whole numbers; cast the float column to int now that no
# missing values remain.
pop_col = 'Pop_Est_July_1_2024'
df[pop_col] = df[pop_col].astype(int)
df

Unnamed: 0,Pop_Est_July_1_2024,County,State
0,61464,Autauga County,AL
1,261608,Baldwin County,AL
2,24358,Barbour County,AL
3,22258,Bibb County,AL
4,60163,Blount County,AL
...,...,...,...
3139,41273,Sweetwater County,WY
3140,23272,Teton County,WY
3141,20621,Uinta County,WY
3142,7662,Washakie County,WY


In [10]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Score each county on a 0-1 scale from log10(population):
#   High_Pop_Score -> 1 for the most populous county, 0 for the least
#   Low_Pop_Score  -> the complement (1 - High_Pop_Score)
# The log transform and the scaler fit are computed once and reused for both
# columns instead of refitting on identical input.
scaler = MinMaxScaler()
log_pop = np.log10(df[["Pop_Est_July_1_2024"]])
high_pop_score = scaler.fit_transform(log_pop)

df['Low_Pop_Score'] = 1 - high_pop_score
df['High_Pop_Score'] = high_pop_score
df

Unnamed: 0,Pop_Est_July_1_2024,County,State,Low_Pop_Score,High_Pop_Score
0,61464,Autauga County,AL,0.414595,0.585405
1,261608,Baldwin County,AL,0.296091,0.703909
2,24358,Barbour County,AL,0.490324,0.509676
3,22258,Bibb County,AL,0.497701,0.502299
4,60163,Blount County,AL,0.416345,0.583655
...,...,...,...,...,...
3139,41273,Sweetwater County,WY,0.447178,0.552822
3140,23272,Teton County,WY,0.494056,0.505944
3141,20621,Uinta County,WY,0.503951,0.496049
3142,7662,Washakie County,WY,0.584954,0.415046
