In [1]:
import pandas as pd
import numpy as np
from glob import glob
from pathlib import Path

## Load

In [2]:
# Load the reference table of Congress members.
ref = pd.read_csv("data/congress_members_with_parties.csv")

# Load every per-year election file and stack them into a single frame.
files = sorted(glob("data/congressional_elections_*.csv"))

dfs = []
for f in files:
    try:
        df = pd.read_csv(f)
    except pd.errors.EmptyDataError:
        # Nothing parsable in the file (not even a header row): skip it.
        continue
    if df.empty:
        # Header only, no data rows: nothing to append.
        continue
    # The year is the last "_"-separated token of the file stem.
    df = df.assign(year=int(Path(f).stem.split("_")[-1]))
    dfs.append(df)

elecs = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
# Drop the unreliable `status` column. errors="ignore" keeps the empty-frame
# fallback above usable: dropping a missing column would otherwise raise KeyError.
elecs = elecs.drop(columns='status', errors="ignore")

## Explore datasets

### Congress members dataset

- Duplicates explained by switches between chambers and gaps between stints
- Most recent title applied retroactively

In [3]:
# Members whose name appears on more than one row (first six rows shown).
# Duplicates arise when a member moves from House to Senate or has a gap in Congress.
# The Senator title also seems to be applied retroactively to earlier rows.
name_repeated = ref.duplicated(subset=["name"], keep=False)
ref.loc[name_repeated].head(6)

Unnamed: 0,name,first_name,last_name,title,state,house_or_senate,start_year,end_year,party
0,"Abdnor, James",James,Abdnor,Senator,South Dakota,Senate,1981,1987,Republican
1,"Abdnor, James",James,Abdnor,Senator,South Dakota,House,1973,1981,Republican
2,"Abercrombie, Neil",Neil,Abercrombie,Representative,Hawaii,House,1985,1987,Democratic
3,"Abercrombie, Neil",Neil,Abercrombie,Representative,Hawaii,House,1991,2011,Democratic
4,"Abourezk, James",James,Abourezk,Senator,South Dakota,Senate,1973,1979,Democratic
5,"Abourezk, James",James,Abourezk,Senator,South Dakota,House,1971,1973,Democratic


In [4]:
# Order stints chronologically within each member BEFORE de-duplicating, so the row
# kept per (name, chamber) is that chamber's earliest stint rather than whichever
# row happens to come first in the file (file order is not chronological).
ref_1 = ref.sort_values(by=['name', 'start_year'])
ref_1 = ref_1.drop_duplicates(subset=['name', 'house_or_senate']).reset_index(drop=True)  # collapse repeat stints in the same chamber
# Names still duplicated have stints in both chambers; keep='last' flags every row
# except the final one per name, i.e. the chamber served in first.
ref_2 = ref_1[ref_1.duplicated(subset=['name'], keep='last')]
ref_2.house_or_senate.value_counts()

# Recorded run: 174 members switched from House to Senate and 1 vice versa
# (re-check these counts after re-running the cell).

house_or_senate
House     174
Senate      1
Name: count, dtype: int64

In [5]:
# Members whose first recorded chamber was the Senate.
ref_2.loc[ref_2["house_or_senate"] == "Senate"]

Unnamed: 0,name,first_name,last_name,title,state,house_or_senate,start_year,end_year,party
1999,"Pepper, Claude",Claude,Pepper,Representative,Florida,Senate,1935,1951,Democratic


In [6]:
# All reference rows for Claude Pepper.
# The most recent title is applied retroactively to every row.
ref.loc[ref["name"] == "Pepper, Claude"]

Unnamed: 0,name,first_name,last_name,title,state,house_or_senate,start_year,end_year,party
2074,"Pepper, Claude",Claude,Pepper,Representative,Florida,Senate,1935,1951,Democratic
2075,"Pepper, Claude",Claude,Pepper,Representative,Florida,House,1963,1991,Democratic


### Elections dataset

- Candidate duplicates explained via special elections
- Name duplicates are down to common names; this does not hinder name-matching

In [7]:
# Rows that share the same candidate URL within a single year.
# Unusual, but possible via special elections.
same_url_same_year = elecs.duplicated(subset=["url", "year"], keep=False)
elecs.loc[same_url_same_year].sort_values(by="name")

Unnamed: 0,name,url,party,year,election
22962,Adam Schiff,/candidate/9489/adam-schiff,Democratic,2024,congressional
22960,Adam Schiff,/candidate/9489/adam-schiff,Democratic,2024,congressional
11349,Albio Sires,/candidate/22510/albio-sires,Democratic,2006,congressional
11868,Albio Sires,/candidate/22510/albio-sires,Democratic,2006,congressional
21701,Alex Padilla,/candidate/59742/alex-padilla,Democratic,2022,congressional
...,...,...,...,...,...
23002,Vince Fong,/candidate/169357/vince-fong,Republican,2024,congressional
12628,Wally Pang,/candidate/69867/wally-pang,Independent,2008,congressional
12623,Wally Pang,/candidate/69867/wally-pang,Independent,2008,congressional
22079,William Henry,/candidate/193495/william-henry,Libertarian,2022,congressional


In [8]:
# Unique candidates: keep the first row seen for each URL.
cand = elecs.drop_duplicates(subset=["url"])

# Distinct URLs that share a name. These are common names, likely not
# the result of duplicated webpages.
shares_name = cand.duplicated(subset=["name"], keep=False)
cand.loc[shares_name].sort_values(by="name")

Unnamed: 0,name,url,party,year,election
9653,Adam Smith,/candidate/35178/adam-smith,Democratic,2004,congressional
4491,Adam Smith,/candidate/845/adam-smith,Democratic,1996,congressional
22943,Alan Aversa,/candidate/216960/alan-aversa,Green Party,2024,congressional
15184,Alan Aversa,/candidate/143566/alan-aversa,Nominated by Petition,2012,congressional
10017,Angela Lariscy,/candidate/43168/angela-lariscy,,2004,congressional
...,...,...,...,...,...
11629,William Smith,/candidate/65713/william-smith,Independent,2006,congressional
693,William Stephens,/candidate/2008/william-stephens,Write-In,1992,congressional
12645,William Stephens,/candidate/102893/william-stephens,Republican,2008,congressional
7365,William Walker,/candidate/47999/william-walker,Libertarian,2000,congressional


In [9]:
# Ten most frequent party labels among unique candidates.
cand["party"].value_counts().head(10)

party
Democratic              3804
Republican              3609
Libertarian             1890
Independent             1034
Write-In                 399
Green Party              378
Natural Law Party        275
No Party Affiliation     191
Constitution             152
Reform                   139
Name: count, dtype: int64

## Strategy and assumptions

The matching strategy iterates over records in the reference dataset and can be summarised in the following steps:

1. Find (in the elections dataset) records with the same surname 

## Matching

In [15]:
from main import namematch

# Run the name matcher on the reference and elections tables; it returns the
# matched and unmatched results, in that order.
mat, unmat = namematch(ref, elecs)

# Persist both result frames as CSV files in the working directory.
for frame, out_path in ((mat, "matched.csv"), (unmat, "unmatched.csv")):
    frame.to_csv(out_path)

[fail-lastname] Mikva, Abner J.: no candidates with last name 'Mikva'
[fail-lastname] Ribicoff, Abraham A.: no candidates with last name 'Ribicoff'
[fail-lastname] Kazen, Abraham, Jr.: no candidates with last name 'Kazen'
[ambiguous] Smith, Adam: multiple candidates remain after deterministic filters:
      name                         url
Adam Smith /candidate/35178/adam-smith
Adam Smith   /candidate/845/adam-smith
[fail-firstname] Benjamin, Adam, Jr.: no candidates with last name 'Benjamin' and first name 'Adam'
[fail-firstname] Stevenson, Adlai E., III: no candidates with last name 'Stevenson' and first name 'Adlai'
[fail-lastname] Ullman, Al: no candidates with last name 'Ullman'
[fail-lastname] Bible, Alan: no candidates with last name 'Bible'
[fail-lastname] Cranston, Alan: no candidates with last name 'Cranston'
[fail-lastname] Steelman, Alan: no candidates with last name 'Steelman'
[fail-firstname] Dixon, Alan J.: no candidates with last name 'Dixon' and first name 'Alan'
[fail

In [11]:
# Headline numbers: distinct reference names on each side, plus a breakdown
# of the recorded reasons for non-matches.
n_matched = mat["ref_name"].nunique()
n_unmatched = unmat["ref_name"].nunique()
reason_counts = unmat["match"].value_counts()

print("Number of matched members:", n_matched)
print("Number of unmatched members:", n_unmatched)
print("Frequency counts of reasons for non-matching:\n", reason_counts)

Number of matched members: 1690
Number of unmatched members: 904
Frequency counts of reasons for non-matching:
 match
fail-lastname     327
fail-firstname    323
only-surname      151
ambiguous         103
Name: count, dtype: int64


In [12]:
# Sitting members have end_year "Present"; swap in the sentinel 9999 so the
# column can be cast to int and compared numerically.
end_year_filled = ref["end_year"].replace("Present", 9999)
ref["end_year"] = end_year_filled.astype(int)

# Distinct member names whose stint ended in 1992 or later.
served_since_1992 = ref["end_year"] >= 1992
print("Number of members after 1992:", ref.loc[served_since_1992, "name"].nunique())

Number of members after 1992: 1917
