In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Read each raw CSV extract from the shared data folder into its own DataFrame.
raw_dir = '../data'

bill_summaries = pd.read_csv(f'{raw_dir}/bill_summaries.csv')
bioinfo = pd.read_csv(f'{raw_dir}/bioinfo.csv')
fec_ids = pd.read_csv(f'{raw_dir}/fec_ids.csv')
ideology = pd.read_csv(f'{raw_dir}/ideology.csv')
sponsored_legislation = pd.read_csv(f'{raw_dir}/sponsored_legislation.csv')
terms = pd.read_csv(f'{raw_dir}/terms.csv')
vote_compare = pd.read_csv(f'{raw_dir}/vote_compare.csv')
contrib = pd.read_csv(f'{raw_dir}/contrib.csv')

1. Does each of the CSVs have a primary key? If so, what is it?

In [3]:
# Show the column labels of bill_summaries; inspected here while looking for a primary key.
bill_summaries.columns

Index(['actionDate', 'actionDesc', 'currentChamber', 'currentChamberCode',
       'lastSummaryUpdateDate', 'text', 'updateDate', 'versionCode',
       'bill.congress', 'bill.number', 'bill.originChamber',
       'bill.originChamberCode', 'bill.title', 'bill.type',
       'bill.updateDateIncludingText', 'bill.url'],
      dtype='object')

In [13]:
# Candidate key for bill_summaries: (bill.type, bill.number, versionCode).
# Any True in the counts below means that combination repeats, i.e. it is not a key.
summary_key = ['bill.type', 'bill.number', 'versionCode']
bill_summaries.duplicated(subset=summary_key).value_counts()

False    2751
Name: count, dtype: int64

In [4]:
# Show the column labels of bioinfo; inspected here while looking for a primary key.
bioinfo.columns

Index(['bioguide_id', 'Full name', 'Chamber', 'State', 'Party', 'District',
       'birthYear', 'image', 'Office address', 'Phone', 'Website'],
      dtype='object')

In [14]:
# Candidate key for bioinfo: bioguide_id. Only False counts means it is unique per row.
bioinfo.duplicated(subset=['bioguide_id']).value_counts()

False    545
Name: count, dtype: int64

In [5]:
# Show the column labels of fec_ids; inspected here while looking for a primary key.
fec_ids.columns

Index(['bioguide_id', 'fec_id'], dtype='object')

In [15]:
# Candidate key for fec_ids: bioguide_id. Only False counts means it is unique per row.
fec_ids.duplicated(subset=['bioguide_id']).value_counts()

False    545
Name: count, dtype: int64

In [6]:
# Show the column labels of ideology; inspected here while looking for a primary key.
ideology.columns

Index(['bioname', 'chamber', 'left_right_ideology', 'state_abbrev',
       'district_code', 'icpsr', 'bioguide_id', 'party'],
      dtype='object')

In [16]:
# Candidate key for ideology: bioguide_id. Only False counts means it is unique per row.
ideology.duplicated(subset=['bioguide_id']).value_counts()

False    545
Name: count, dtype: int64

In [7]:
# Show the column labels of sponsored_legislation; inspected here while looking for a primary key.
sponsored_legislation.columns

Index(['introducedDate', 'type', 'url', 'number', 'title', 'bioguide_id'], dtype='object')

In [25]:
# Candidate key for sponsored_legislation: url. Only False counts means it is unique per row.
sponsored_legislation.duplicated(subset=['url']).value_counts()

False    14379
Name: count, dtype: int64

In [8]:
# Show the column labels of terms; inspected here while looking for a primary key.
terms.columns

Index(['bioguide_id', 'chamber', 'congress', 'stateCode', 'startYear',
       'endYear', 'district'],
      dtype='object')

In [30]:
# Candidate composite key for terms: (bioguide_id, chamber, congress).
# Any True in the counts below means that combination repeats.
terms_key = ['bioguide_id', 'chamber', 'congress']
terms.duplicated(subset=terms_key).value_counts()

False    3257
Name: count, dtype: int64

In [9]:
# Show the column labels of vote_compare; inspected here while looking for a primary key.
vote_compare.columns

Index(['bioname', 'comparison_member', 'agree'], dtype='object')

In [31]:
# Candidate composite key for vote_compare: the (bioname, comparison_member) pair.
# Any True in the counts below means a pair appears more than once.
pair_key = ['bioname', 'comparison_member']
vote_compare.duplicated(subset=pair_key).value_counts()

False    206040
Name: count, dtype: int64

In [3]:
# Show the column labels of contrib; inspected here while looking for a primary key.
contrib.columns

Index(['contributor_name', 'contributor_aggregate_ytd', 'memo_text', 'pdf_url',
       'fec_committee_id', 'fec_id'],
      dtype='object')

In [13]:
# Flag every row that has an exact copy elsewhere in contrib; keep=False marks
# all members of a duplicate group rather than only the later ones.
is_repeated = contrib.duplicated(keep=False)

# Preview the first ten such rows, ordered by pdf_url.
contrib[is_repeated].sort_values(by='pdf_url').head(10)

Unnamed: 0,contributor_name,contributor_aggregate_ytd,memo_text,pdf_url,fec_committee_id,fec_id
595599,"LOUKAS, GEORGE",2500.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
602339,"LOUKAS, GEORGE",2500.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
601825,"NOWOTNY, DONALD",300.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
601824,"NOWOTNY, DONALD",300.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
142219,"FETHERSTON, BARBARA",2900.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149
136996,"FETHERSTON, BARBARA",2900.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149
136985,"GREENSPAN, ROBB T.",4800.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149
136986,"GREENSPAN, ROBB T.",4800.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149
142212,"GREWAL, KULDIP",700.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149
144091,"GREWAL, KULDIP",700.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149


In [6]:
# Keep one copy of each fully identical contribution row, then order by pdf_url.
contrib_clean = (
    contrib
    .drop_duplicates(keep='first')
    .sort_values(by='pdf_url')
)
contrib_clean.head(n=5)

Unnamed: 0,contributor_name,contributor_aggregate_ytd,memo_text,pdf_url,fec_committee_id,fec_id
600548,"ARENSON, PETER",1300.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
600174,"BIRMINGHAM, BARBARA",2000.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
601465,"BAUER, MICHAEL",250.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
602341,"CAMPBELL, THOMAS",400.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
600547,"BODEN, HANS",2300.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096


In [12]:
# Candidate key for the de-duplicated contributions: pdf_url alone.
# True counts are rows whose pdf_url already appeared on an earlier row.
contrib_clean.duplicated(subset=['pdf_url']).value_counts()

True     378715
False    217665
Name: count, dtype: int64

2. Do any of the tables need to be separated into multiple tables (because they violate 2NF), or combined into one table (because they share the same primary key)?

In [57]:
# bioinfo and fec_ids are both keyed on bioguide_id, so join them into one table.
# validate='1:1' raises if the key repeats on either side; the 'matched'
# indicator column records which input(s) each row came from.
members = bioinfo.merge(
    fec_ids,
    how='outer',
    on='bioguide_id',
    validate='1:1',
    indicator='matched',
)
members['matched'].value_counts()

matched
both          545
left_only       0
right_only      0
Name: count, dtype: int64

In [58]:
# Discard the indicator from the previous join, then fold ideology in on the
# same key with a fresh 'matched' indicator.
# NOTE: this reassigns `members`, so re-running this cell on its own merges
# ideology a second time; re-run from the first merge cell instead.
members = (
    members
    .drop(columns='matched')
    .merge(ideology, how='outer', on='bioguide_id', validate='1:1', indicator='matched')
)
members['matched'].value_counts()

matched
both          545
left_only       0
right_only      0
Name: count, dtype: int64

In [59]:
# The merge indicator has served its purpose; remove it and preview the combined table.
members = members.drop(columns=['matched'])
members.head(n=5)

Unnamed: 0,bioguide_id,Full name,Chamber,State,Party,District,birthYear,image,Office address,Phone,Website,fec_id,bioname,chamber,left_right_ideology,state_abbrev,district_code,icpsr,party
0,A000055,Robert B. Aderholt,House of Representatives,Alabama,Republican,4.0,1965.0,https://www.congress.gov/img/member/a000055_20...,"272 Cannon House Office Building, Washington, ...",(202) 225-4876,https://aderholt.house.gov/,H6AL04098,"ADERHOLT, Robert",House,0.405,AL,4,29701,Republican
1,A000148,Jake Auchincloss,House of Representatives,Massachusetts,Democratic,4.0,1988.0,https://www.congress.gov/img/member/67817e391f...,"1524 Longworth House Office Building, Washingt...",(202) 225-5931,https://auchincloss.house.gov,H0MA04192,"AUCHINCLOSS, Jake",House,-0.288,MA,4,22100,Democrat
2,A000369,Mark E. Amodei,House of Representatives,Nevada,Republican,2.0,1958.0,https://www.congress.gov/img/member/a000369_20...,"104 Cannon House Office Building, Washington, ...",(202) 225-6155,https://amodei.house.gov,H2NV02395,"AMODEI, Mark E.",House,0.384,NV,2,21196,Republican
3,A000370,Alma S. Adams,House of Representatives,North Carolina,Democratic,12.0,1946.0,https://www.congress.gov/img/member/a000370_20...,"2436 Rayburn House Office Building, Washington...",(202) 225-1510,https://adams.house.gov,H4NC12100,"ADAMS, Alma",House,-0.462,NC,12,21545,Democrat
4,A000371,Pete Aguilar,House of Representatives,California,Democratic,33.0,1979.0,https://www.congress.gov/img/member/a000371_20...,"108 Cannon House Office Building, Washington, ...",(202) 225-3201,https://aguilar.house.gov/,H2CA31125,"AGUILAR, Peter Rey",House,-0.324,CA,33,21506,Democrat


In [60]:
# bioinfo columns that have a counterpart among the ideology columns
# (bioname, chamber, state_abbrev, party, district_code). 'Chamber' in
# particular must go before the lower-casing below, otherwise the table would
# end up with two columns both named 'chamber'.
redundant_bioinfo_cols = ['Full name', 'Chamber', 'State', 'Party', 'District']
members = members.drop(columns=redundant_bioinfo_cols)

# Normalise labels: lower-case, spaces replaced by underscores.
members.columns = [c.lower().replace(' ', '_') for c in members.columns]

# birthYear was parsed as float (the preview shows 1965.0), presumably because
# some members have no value. Cast to pandas' nullable integer type so the
# exported CSV holds 1965 rather than 1965.0 while still allowing missing years.
members = members.astype({'birthyear': 'Int64'})

members.head(3).T

Unnamed: 0,0,1,2
bioguide_id,A000055,A000148,A000369
birthyear,1965.0,1988.0,1958.0
image,https://www.congress.gov/img/member/a000055_20...,https://www.congress.gov/img/member/67817e391f...,https://www.congress.gov/img/member/a000369_20...
office_address,"272 Cannon House Office Building, Washington, ...","1524 Longworth House Office Building, Washingt...","104 Cannon House Office Building, Washington, ..."
phone,(202) 225-4876,(202) 225-5931,(202) 225-6155
website,https://aderholt.house.gov/,https://auchincloss.house.gov,https://amodei.house.gov
fec_id,H6AL04098,H0MA04192,H2NV02395
bioname,"ADERHOLT, Robert","AUCHINCLOSS, Jake","AMODEI, Mark E."
chamber,House,House,House
left_right_ideology,0.405,-0.288,0.384


In [61]:
# DataFrame.to_csv does not create missing parent directories. This is the
# first cell (top to bottom) that writes into ../data/3NF, so make sure the
# folder exists; on a fresh checkout it may not.
Path('../data/3NF').mkdir(parents=True, exist_ok=True)

# Persist the combined member table without the positional index.
members.to_csv('../data/3NF/members.csv', index=False)

In [45]:
# Preview the first three summaries transposed, so each field is a row.
bill_summaries.head(n=3).transpose()

Unnamed: 0,0,1,2
actionDate,2025-10-15,2025-10-08,2025-10-08
actionDesc,Introduced in Senate,Introduced in House,Introduced in House
currentChamber,Senate,House,House
currentChamberCode,S,H,H
lastSummaryUpdateDate,2025-10-20T19:23:02Z,2025-10-20T14:56:29Z,2025-10-20T14:06:32Z
text,<p><strong>Shutdown Fairness Act</strong></p><...,<p><strong>Federal Worker Childcare Protection...,<p>This bill requires the federal government t...
updateDate,2025-10-20T19:23:15Z,2025-10-20T14:56:58Z,2025-10-20T14:07:00Z
versionCode,0,0,0
bill.congress,119,119,119
bill.number,3012,5720,5705


In [53]:
# For every column, report whether it is fully determined by the bill key
# (bill.type, bill.number): True means no bill carries more than one distinct
# value in that column, so the column belongs in a bill-level table.
#
# The check is "every group has at most one distinct value", with nulls counted
# as a value (dropna=False). Comparing the *mean* of the per-group counts to 1
# is not equivalent: an all-null group counts as 0 and a {null, value} group
# counts as 1 when nulls are dropped, so violations can be missed or averaged out.
bill_key = ['bill.type', 'bill.number']
for col in bill_summaries.columns:
    print(col)
    distinct_per_bill = bill_summaries.groupby(bill_key)[col].nunique(dropna=False)
    print(bool(distinct_per_bill.le(1).all()))

actionDate
False
actionDesc
False
currentChamber
False
currentChamberCode
False
lastSummaryUpdateDate
False
text
False
updateDate
False
versionCode
False
bill.congress
True
bill.number
True
bill.originChamber
True
bill.originChamberCode
True
bill.title
True
bill.type
True
bill.updateDateIncludingText
True
bill.url
True


In [55]:
# Columns that take a single value per (bill.type, bill.number): these form the bill-level table.
bill_level_cols = [
    'bill.type',
    'bill.number',
    'bill.congress',
    'bill.originChamber',
    'bill.originChamberCode',
    'bill.title',
    'bill.updateDateIncludingText',
    'bill.url',
]

# One row per distinct combination of the bill-level columns.
bills = bill_summaries.loc[:, bill_level_cols].drop_duplicates()

# Normalise labels: lower-case, spaces replaced by underscores.
bills = bills.rename(columns=lambda label: label.lower().replace(' ', '_'))

bills.to_csv('../data/3NF/bills.csv', index=False)

In [63]:
# Bill-level attributes that are removed from the per-version table;
# bill.type and bill.number are kept so each version still references its bill.
bill_only_cols = [
    'bill.congress',
    'bill.originChamber',
    'bill.originChamberCode',
    'bill.title',
    'bill.updateDateIncludingText',
    'bill.url',
]
bill_versions = bill_summaries.drop(columns=bill_only_cols)

# Normalise labels: lower-case, spaces replaced by underscores.
bill_versions = bill_versions.rename(columns=lambda label: label.lower().replace(' ', '_'))

bill_versions.to_csv('../data/3NF/bill_versions.csv', index=False)

In [56]:
# Preview the first three member-pair rows transposed, so each field is a row.
vote_compare.head(n=3).transpose()

Unnamed: 0,0,1,2
bioname,"GRASSLEY, Charles Ernest","GRASSLEY, Charles Ernest","GRASSLEY, Charles Ernest"
comparison_member,"MARKEY, Edward John","SCHUMER, Charles Ellis (Chuck)","WYDEN, Ronald Lee"
agree,0.021053,0.07193,0.052632


In [62]:
# vote_compare is exported as loaded (no columns changed), without the positional index.
vote_compare.to_csv(path_or_buf='../data/3NF/vote_compare.csv', index=False)