In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import janitor
import re

In [2]:
# Load the IIJA funding workbook and tidy it in one pass:
#   1. clean_names() (made available by `import janitor`) normalises the headers,
#   2. the two columns of interest get short names (keys are the cleaned
#      headers exactly as spelled in the workbook, so they must not be "fixed"),
#   3. state names are title-cased.
iija_df = (
    pd.read_excel("iija.xlsx")
    .clean_names()
    .rename(
        columns={
            "state_teritory_or_tribal_nation": "state",
            "total_billions_": "total_funding",
        }
    )
    .assign(state=lambda frame: frame["state"].str.title())
)

In [3]:
# Display the cleaned funding table (columns: state, total_funding).
iija_df

Unnamed: 0,state,total_funding
0,Alabama,3.0
1,Alaska,3.7
2,American Samoa,0.0686
3,Arizona,3.5
4,Arkansas,2.8
5,California,18.4
6,Colorado,3.2
7,Connecticut,2.5
8,Deleware,0.792
9,District Of Columbia,1.1


Population data source (Wikipedia): https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population


In [6]:
import requests
from io import StringIO

# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population"

# Send the request with a browser-like User-Agent. The timeout keeps a stalled
# connection from hanging the kernel indefinitely.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Take the first table on the page and keep its first 56 rows and first two
# columns (presumably the 50 states, DC and the territories -- verify against
# the live page if its layout changes).
# Selection is positional and the columns are renamed immediately below, so the
# scraped (possibly multi-level) header text needs no normalising.
# .copy() makes the slice an independent frame, so the column assignments that
# follow do not trigger SettingWithCopyWarning.
population_df = pd.read_html(StringIO(response.text))[0].iloc[:56, :2].copy()
population_df.columns = ['state', 'population']

# Strip bracketed footnote markers and surrounding whitespace from state names
population_df['state'] = population_df['state'].str.replace(r'\[.*?\]', '', regex=True).str.strip()

# Express population in billions of people so it is on the same scale as the
# IIJA funding column (which is in billions); the result is a float column.
population_df['population'] = population_df['population'].astype(int) / 1e9

In [7]:
# Display the scraped population table (population is in billions of people).
population_df

Unnamed: 0,state,population
0,California,0.039431
1,Texas,0.031291
2,Florida,0.023372
3,New York,0.019867
4,Pennsylvania,0.013079
5,Illinois,0.01271
6,Ohio,0.011883
7,Georgia,0.011181
8,North Carolina,0.011046
9,Michigan,0.01014


In [8]:
# Shows population_df again; nothing between this cell and the previous
# display cell modifies it, so the output is the same.
population_df

Unnamed: 0,state,population
0,California,0.039431
1,Texas,0.031291
2,Florida,0.023372
3,New York,0.019867
4,Pennsylvania,0.013079
5,Illinois,0.01271
6,Ohio,0.011883
7,Georgia,0.011181
8,North Carolina,0.011046
9,Michigan,0.01014


In [None]:
from rapidfuzz import process, fuzz

def fuzzy_match_with_score(name, choices, scorer=fuzz.token_sort_ratio):
    """Find the entry of ``choices`` that best matches ``name``.

    Delegates to ``process.extractOne`` using ``scorer``.

    Returns:
        ``(best_match, score)`` for the top candidate, or ``(None, None)``
        when ``extractOne`` yields no result.
    """
    best = process.extractOne(name, choices, scorer=scorer)
    if not best:
        return None, None
    candidate, score = best[0], best[1]
    return candidate, score

# Candidate names to match against: the state names from the population table
df_states = population_df['state'].tolist()


def _match_row(state_name):
    """Return the best match and its score as a two-element Series."""
    return pd.Series(fuzzy_match_with_score(state_name, df_states))


# One row per IIJA entry: column 0 is the matched name, column 1 the score
matches = iija_df['state'].apply(_match_row)
iija_df[['matched_state', 'match_score']] = matches

# Flag matches scoring below 80 as low confidence
iija_df['low_confidence'] = iija_df['match_score'].lt(80)

# Attach population via the matched name. Both frames have a 'state' column,
# so the merge suffixes them; keep the IIJA one and drop the population one.
merged_df = (
    iija_df
    .merge(population_df, how='left', left_on='matched_state', right_on='state')
    .drop(columns=['state_y'])
    .rename(columns={'state_x': 'state'})
)

# Final view: the weakest matches first
preview_cols = ['state', 'total_funding', 'matched_state', 'match_score', 'low_confidence', 'population']
merged_df[preview_cols].sort_values('match_score').head()


Unnamed: 0,state,total_funding,matched_state,match_score,low_confidence,population
48,Tribal Communities,3.0,Louisiana,44.444444,True,0.004598
8,Deleware,0.792,Delaware,87.5,False,0.001052
49,Us Virgin Islands,0.1483,U.S. Virgin Islands,88.888889,False,0.000105
9,District Of Columbia,1.1,District of Columbia,95.0,False,0.000702
0,Alabama,3.0,Alabama,100.0,False,0.005158


In [73]:
# Keep only the rows whose fuzzy match was not flagged as low confidence
is_low_confidence = merged_df['low_confidence']
merged_filter_df = merged_df.loc[~is_low_confidence]

In [74]:
# Build the per-state table, using the matched (population-table) spelling
# of each name as the 'state' column.
state_df = (
    merged_filter_df
    .loc[:, ['matched_state', 'total_funding', 'population']]
    .rename(columns={'matched_state': 'state'})
)
state_df

Unnamed: 0,state,total_funding,population
0,Alabama,3.0,0.005158
1,Alaska,3.7,0.00074
2,American Samoa,0.0686,4.8e-05
3,Arizona,3.5,0.007582
4,Arkansas,2.8,0.003088
5,California,18.4,0.039431
6,Colorado,3.2,0.005957
7,Connecticut,2.5,0.003675
8,Delaware,0.792,0.001052
9,District of Columbia,1.1,0.000702


Election results source (GitHub repository): https://github.com/tonmcg/US_County_Level_Election_Results_08-24

Raw CSV of the 2020 county-level presidential results: https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-24/refs/heads/master/2020_US_County_Level_Presidential_Results.csv

In [75]:
# County-level 2020 presidential results, read straight from the raw CSV on GitHub
ELECTION_RESULTS_URL = "https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-24/refs/heads/master/2020_US_County_Level_Presidential_Results.csv"
election_results_df = pd.read_csv(ELECTION_RESULTS_URL)

In [76]:
# Display the county-level results (one row per county, with GOP/Dem vote counts).
election_results_df

Unnamed: 0,state_name,county_fips,county_name,votes_gop,votes_dem,total_votes,diff,per_gop,per_dem,per_point_diff
0,Alabama,1001,Autauga County,19838,7503,27770,12335,0.714368,0.270184,0.444184
1,Alabama,1003,Baldwin County,83544,24578,109679,58966,0.761714,0.224090,0.537623
2,Alabama,1005,Barbour County,5622,4816,10518,806,0.534512,0.457882,0.076631
3,Alabama,1007,Bibb County,7525,1986,9595,5539,0.784263,0.206983,0.577280
4,Alabama,1009,Blount County,24711,2640,27588,22071,0.895716,0.095694,0.800022
...,...,...,...,...,...,...,...,...,...,...
3147,Wyoming,56037,Sweetwater County,12229,3823,16603,8406,0.736554,0.230260,0.506294
3148,Wyoming,56039,Teton County,4341,9848,14677,-5507,0.295769,0.670982,-0.375213
3149,Wyoming,56041,Uinta County,7496,1591,9402,5905,0.797277,0.169219,0.628058
3150,Wyoming,56043,Washakie County,3245,651,4012,2594,0.808824,0.162263,0.646560


In [77]:
# Keep only the identifying columns and the two party vote counts
election_cols = ['state_name', 'county_name', 'votes_gop', 'votes_dem']
election_df = election_results_df.loc[:, election_cols]

In [78]:
# Collapse county rows into one row per state. numeric_only=True sums just the
# vote counts and leaves out the text county_name column; state_name becomes
# the index of the result.
votes_by_state = election_df.groupby('state_name')
election_df = votes_by_state.sum(numeric_only=True)
election_df

Unnamed: 0_level_0,votes_gop,votes_dem
state_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,1441168,849648
Alaska,189892,153405
Arizona,1661686,1672143
Arkansas,760647,423932
California,6005961,11109764
Colorado,1364607,1804352
Connecticut,715291,1080680
Delaware,200603,296268
District of Columbia,18586,317323
Florida,5668731,5297045


In [79]:
# Left join: every row of state_df is kept, and rows with no entry in the
# election table get NaN vote counts.
analysis_df = pd.merge(
    state_df,
    election_df,
    how='left',
    left_on='state',
    right_on='state_name',
)
analysis_df

Unnamed: 0,state,total_funding,population,votes_gop,votes_dem
0,Alabama,3.0,0.005158,1441168.0,849648.0
1,Alaska,3.7,0.00074,189892.0,153405.0
2,American Samoa,0.0686,4.8e-05,,
3,Arizona,3.5,0.007582,1661686.0,1672143.0
4,Arkansas,2.8,0.003088,760647.0,423932.0
5,California,18.4,0.039431,6005961.0,11109764.0
6,Colorado,3.2,0.005957,1364607.0,1804352.0
7,Connecticut,2.5,0.003675,715291.0,1080680.0
8,Delaware,0.792,0.001052,200603.0,296268.0
9,District of Columbia,1.1,0.000702,18586.0,317323.0


In [80]:
# Express the vote totals in billions, the same 1e9 scaling applied to population.
# The values are looked up from the raw per-state counts in election_df (indexed
# by state name) instead of dividing analysis_df's own columns in place, so
# re-running this cell gives the same result rather than shrinking the numbers
# by another factor of 1e9 each time. States absent from election_df stay NaN.
for vote_col in ['votes_gop', 'votes_dem']:
    analysis_df[vote_col] = analysis_df['state'].map(election_df[vote_col]) / 1e9

In [81]:
# Summary statistics for the numeric columns of the combined table.
analysis_df.describe()

Unnamed: 0,total_funding,population,votes_gop,votes_dem
count,56.0,56.0,51.0,51.0
mean,3.447007,0.006137,0.001455,0.001593
std,3.323185,0.007457,0.001413,0.001916
min,0.0686,4.5e-05,1.9e-05,7.3e-05
25%,1.3,0.001408,0.000474,0.000399
50%,2.7,0.003885,0.00102,0.000856
75%,3.925,0.007316,0.001791,0.002376
max,18.4,0.039431,0.006006,0.01111
