In [106]:
import numpy as np
import pandas as pd

In [107]:
# Directory holding the raw CSV extracts. Defined once so that relocating the
# data only requires changing this single line.
DATA_DIR = '../data'

# Load every raw table with pandas' default parsing (no dtype or date overrides).
bill_summaries = pd.read_csv(f'{DATA_DIR}/bill_summaries.csv')
bioinfo = pd.read_csv(f'{DATA_DIR}/bioinfo.csv')
fec_ids = pd.read_csv(f'{DATA_DIR}/fec_ids.csv')
ideology = pd.read_csv(f'{DATA_DIR}/ideology.csv')
sponsored_legislation = pd.read_csv(f'{DATA_DIR}/sponsored_legislation.csv')
terms = pd.read_csv(f'{DATA_DIR}/terms.csv')
vote_compare = pd.read_csv(f'{DATA_DIR}/vote_compare.csv')
contrib = pd.read_csv(f'{DATA_DIR}/contrib.csv')

1. Does each of the CSVs have a primary key? If so, what is it?

In [108]:
bill_summaries.columns

Index(['actionDate', 'actionDesc', 'currentChamber', 'currentChamberCode',
       'lastSummaryUpdateDate', 'text', 'updateDate', 'versionCode',
       'bill.congress', 'bill.number', 'bill.originChamber',
       'bill.originChamberCode', 'bill.title', 'bill.type',
       'bill.updateDateIncludingText', 'bill.url'],
      dtype='object')

In [109]:
# Candidate key check: tally rows whose (bill.type, bill.number, versionCode)
# combination repeats an earlier row. Only `False` in the tally means the
# combination is unique.
bill_summaries.duplicated(subset=['bill.type', 'bill.number', 'versionCode']).value_counts()

False    2751
Name: count, dtype: int64

In [110]:
bioinfo.columns

Index(['bioguide_id', 'Full name', 'Chamber', 'State', 'Party', 'District',
       'birthYear', 'image', 'Office address', 'Phone', 'Website'],
      dtype='object')

In [111]:
# Candidate key check: tally rows whose bioguide_id repeats an earlier row.
bioinfo.duplicated(subset=['bioguide_id']).value_counts()

False    545
Name: count, dtype: int64

In [112]:
fec_ids.columns

Index(['bioguide_id', 'fec_id'], dtype='object')

In [113]:
# Candidate key check: tally rows whose bioguide_id repeats an earlier row.
fec_ids.duplicated(subset=['bioguide_id']).value_counts()

False    545
Name: count, dtype: int64

In [114]:
ideology.columns

Index(['bioname', 'chamber', 'left_right_ideology', 'state_abbrev',
       'district_code', 'icpsr', 'bioguide_id', 'party'],
      dtype='object')

In [115]:
# Candidate key check: tally rows whose bioguide_id repeats an earlier row.
ideology.duplicated(subset=['bioguide_id']).value_counts()

False    545
Name: count, dtype: int64

In [116]:
sponsored_legislation.columns

Index(['introducedDate', 'type', 'url', 'number', 'title', 'bioguide_id'], dtype='object')

In [117]:
# Candidate key check: tally rows whose url repeats an earlier row.
sponsored_legislation.duplicated(subset=['url']).value_counts()

False    14379
Name: count, dtype: int64

In [118]:
terms.columns

Index(['bioguide_id', 'chamber', 'congress', 'stateCode', 'startYear',
       'endYear', 'district'],
      dtype='object')

In [119]:
# Candidate key check: tally rows whose (bioguide_id, chamber, congress)
# combination repeats an earlier row.
terms.duplicated(subset=['bioguide_id', 'chamber', 'congress']).value_counts()

False    3257
Name: count, dtype: int64

In [120]:
vote_compare.columns

Index(['bioname', 'comparison_member', 'agree'], dtype='object')

In [121]:
# Candidate key check: tally rows whose (bioname, comparison_member) pair
# repeats an earlier row.
vote_compare.duplicated(subset=['bioname', 'comparison_member']).value_counts()

False    206040
Name: count, dtype: int64

In [122]:
contrib.columns

Index(['contributor_name', 'contributor_aggregate_ytd', 'memo_text', 'pdf_url',
       'fec_committee_id', 'fec_id'],
      dtype='object')

In [123]:
# Preview fully identical rows. keep=False flags every member of a duplicate
# group (not just the repeats), so the copies appear together once sorted.
is_repeated_row = contrib.duplicated(keep=False)
contrib[is_repeated_row].sort_values(by='pdf_url').head(10)

Unnamed: 0,contributor_name,contributor_aggregate_ytd,memo_text,pdf_url,fec_committee_id,fec_id
595599,"LOUKAS, GEORGE",2500.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
602339,"LOUKAS, GEORGE",2500.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
601825,"NOWOTNY, DONALD",300.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
601824,"NOWOTNY, DONALD",300.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
142219,"FETHERSTON, BARBARA",2900.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149
136996,"FETHERSTON, BARBARA",2900.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149
136985,"GREENSPAN, ROBB T.",4800.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149
136986,"GREENSPAN, ROBB T.",4800.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149
142212,"GREWAL, KULDIP",700.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149
144091,"GREWAL, KULDIP",700.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109303...,C00462697,H0CA10149


In [124]:
# Remove exact duplicate rows, then order by the filing URL.
contrib = contrib.drop_duplicates()
contrib = contrib.sort_values(by='pdf_url')
# Tally rows whose pdf_url repeats an earlier row, to see whether pdf_url
# alone could serve as a key.
contrib.duplicated(subset=['pdf_url']).value_counts()

True     378715
False    217665
Name: count, dtype: int64

2. Do any of the tables need to be separated into multiple tables (because they violate 2NF), or combined into one table (because they share the same primary key)?

In [125]:
# Join bioinfo with fec_ids on bioguide_id. validate='1:1' makes pandas raise
# if the key repeats on either side; the 'matched' indicator column records
# which side(s) each row came from.
members = bioinfo.merge(
    fec_ids,
    on='bioguide_id',
    how='outer',
    validate='1:1',
    indicator='matched',
)
members['matched'].value_counts()

matched
both          545
left_only       0
right_only      0
Name: count, dtype: int64

In [126]:
# Discard the previous merge indicator, then join ideology onto members using
# the same 1:1 outer-merge settings and a fresh 'matched' indicator.
members = (
    members
    .drop(columns='matched')
    .merge(
        ideology,
        on='bioguide_id',
        how='outer',
        validate='1:1',
        indicator='matched',
    )
)
members['matched'].value_counts()

matched
both          545
left_only       0
right_only      0
Name: count, dtype: int64

In [127]:
# The merge indicator has served its purpose; remove it and preview the result.
members = members.drop(columns='matched')
members.head()

Unnamed: 0,bioguide_id,Full name,Chamber,State,Party,District,birthYear,image,Office address,Phone,Website,fec_id,bioname,chamber,left_right_ideology,state_abbrev,district_code,icpsr,party
0,A000055,Robert B. Aderholt,House of Representatives,Alabama,Republican,4.0,1965.0,https://www.congress.gov/img/member/a000055_20...,"272 Cannon House Office Building, Washington, ...",(202) 225-4876,https://aderholt.house.gov/,H6AL04098,"ADERHOLT, Robert",House,0.405,AL,4,29701,Republican
1,A000148,Jake Auchincloss,House of Representatives,Massachusetts,Democratic,4.0,1988.0,https://www.congress.gov/img/member/67817e391f...,"1524 Longworth House Office Building, Washingt...",(202) 225-5931,https://auchincloss.house.gov,H0MA04192,"AUCHINCLOSS, Jake",House,-0.288,MA,4,22100,Democrat
2,A000369,Mark E. Amodei,House of Representatives,Nevada,Republican,2.0,1958.0,https://www.congress.gov/img/member/a000369_20...,"104 Cannon House Office Building, Washington, ...",(202) 225-6155,https://amodei.house.gov,H2NV02395,"AMODEI, Mark E.",House,0.384,NV,2,21196,Republican
3,A000370,Alma S. Adams,House of Representatives,North Carolina,Democratic,12.0,1946.0,https://www.congress.gov/img/member/a000370_20...,"2436 Rayburn House Office Building, Washington...",(202) 225-1510,https://adams.house.gov,H4NC12100,"ADAMS, Alma",House,-0.462,NC,12,21545,Democrat
4,A000371,Pete Aguilar,House of Representatives,California,Democratic,33.0,1979.0,https://www.congress.gov/img/member/a000371_20...,"108 Cannon House Office Building, Washington, ...",(202) 225-3201,https://aguilar.house.gov/,H2CA31125,"AGUILAR, Peter Rey",House,-0.324,CA,33,21506,Democrat


In [128]:
# Drop the capitalised bioinfo columns, then normalise the remaining names to
# lower case with underscores instead of spaces.
members = (
    members
    .drop(columns=['Full name', 'Chamber', 'State', 'Party', 'District'])
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
)
members.head(3).T

Unnamed: 0,0,1,2
bioguide_id,A000055,A000148,A000369
birthyear,1965.0,1988.0,1958.0
image,https://www.congress.gov/img/member/a000055_20...,https://www.congress.gov/img/member/67817e391f...,https://www.congress.gov/img/member/a000369_20...
office_address,"272 Cannon House Office Building, Washington, ...","1524 Longworth House Office Building, Washingt...","104 Cannon House Office Building, Washington, ..."
phone,(202) 225-4876,(202) 225-5931,(202) 225-6155
website,https://aderholt.house.gov/,https://auchincloss.house.gov,https://amodei.house.gov
fec_id,H6AL04098,H0MA04192,H2NV02395
bioname,"ADERHOLT, Robert","AUCHINCLOSS, Jake","AMODEI, Mark E."
chamber,House,House,House
left_right_ideology,0.405,-0.288,0.384


In [129]:
members.to_csv('../data/3NF/members.csv', index=False)

In [130]:
bill_summaries.head(3).T

Unnamed: 0,0,1,2
actionDate,2025-10-15,2025-10-08,2025-10-08
actionDesc,Introduced in Senate,Introduced in House,Introduced in House
currentChamber,Senate,House,House
currentChamberCode,S,H,H
lastSummaryUpdateDate,2025-10-20T19:23:02Z,2025-10-20T14:56:29Z,2025-10-20T14:06:32Z
text,<p><strong>Shutdown Fairness Act</strong></p><...,<p><strong>Federal Worker Childcare Protection...,<p>This bill requires the federal government t...
updateDate,2025-10-20T19:23:15Z,2025-10-20T14:56:58Z,2025-10-20T14:07:00Z
versionCode,0,0,0
bill.congress,119,119,119
bill.number,3012,5720,5705


In [131]:
# For each column, report whether it takes exactly one distinct value within
# every (bill.type, bill.number) group, i.e. whether it depends only on that
# pair rather than on the individual summary row.
for col in bill_summaries.columns:
    print(col)
    df = bill_summaries.groupby(['bill.type', 'bill.number']).agg({col: 'nunique'})
    # Test every group explicitly. Comparing the *mean* count to 1 can pass by
    # accident, because 'nunique' ignores NaN and may therefore yield 0, and a
    # mix of 0s and 2s also averages to 1.
    print(bool((df.to_numpy() == 1).all()))

actionDate
False
actionDesc
False
currentChamber
False
currentChamberCode
False
lastSummaryUpdateDate
False
text
False
updateDate
False
versionCode
False
bill.congress
True
bill.number
True
bill.originChamber
True
bill.originChamberCode
True
bill.title
True
bill.type
True
bill.updateDateIncludingText
True
bill.url
True


In [132]:
# Build the bill-level table: keep the two key columns plus the other
# `bill.*` columns, collapse the repeated rows, and switch the column names to
# lower case with underscores instead of dots.
bills = (
    bill_summaries[[
        'bill.type',
        'bill.number',
        'bill.congress',
        'bill.originChamber',
        'bill.originChamberCode',
        'bill.title',
        'bill.updateDateIncludingText',
        'bill.url',
    ]]
    .drop_duplicates()
    .rename(columns=lambda name: name.lower().replace('.', '_'))
)
bills.to_csv('../data/3NF/bills.csv', index=False)

In [133]:
# Build the version-level table: remove the bill-level attributes but keep
# bill.type and bill.number so each row can still be tied back to its bill.
# Column names are lower-cased with dots replaced by underscores.
bill_versions = (
    bill_summaries
    .drop(columns=[
        'bill.congress',
        'bill.originChamber',
        'bill.originChamberCode',
        'bill.title',
        'bill.updateDateIncludingText',
        'bill.url',
    ])
    .rename(columns=lambda name: name.lower().replace('.', '_'))
)
bill_versions.to_csv('../data/3NF/bill_versions.csv', index=False)

In [134]:
vote_compare.head(3).T

Unnamed: 0,0,1,2
bioname,"GRASSLEY, Charles Ernest","GRASSLEY, Charles Ernest","GRASSLEY, Charles Ernest"
comparison_member,"MARKEY, Edward John","SCHUMER, Charles Ellis (Chuck)","WYDEN, Ronald Lee"
agree,0.021053,0.07193,0.052632


In [135]:
vote_compare.to_csv('../data/3NF/vote_compare.csv', index=False)

In [136]:
sponsored_legislation.head(10)

Unnamed: 0,introducedDate,type,url,number,title,bioguide_id
0,2025-09-10,,https://api.congress.gov/v3/amendment/119/hamd...,,,R000575
1,2025-09-09,,https://api.congress.gov/v3/amendment/119/hamd...,,,R000575
2,2025-09-09,,https://api.congress.gov/v3/amendment/119/hamd...,,,R000575
3,2025-09-09,,https://api.congress.gov/v3/amendment/119/hamd...,,,R000575
4,2025-09-09,,https://api.congress.gov/v3/amendment/119/hamd...,,,R000575
5,2025-09-09,,https://api.congress.gov/v3/amendment/119/hamd...,,,R000575
6,2025-06-25,HR,https://api.congress.gov/v3/bill/119/hr/4147?f...,4147.0,Poarch Band of Creek Indians Parity Act,R000575
7,2025-06-09,HR,https://api.congress.gov/v3/bill/119/hr/3838?f...,3838.0,Streamlining Procurement for Effective Executi...,R000575
8,2025-04-08,HR,https://api.congress.gov/v3/bill/119/hr/2740?f...,2740.0,To modify the boundaries of the Talladega Nati...,R000575
9,2025-03-31,HR,https://api.congress.gov/v3/bill/119/hr/2519?f...,2519.0,To provide a per diem allowance for Members of...,R000575


In [137]:
bills = pd.read_csv('../data/3NF/bills.csv')
bills.head()

Unnamed: 0,bill_type,bill_number,bill_congress,bill_originchamber,bill_originchambercode,bill_title,bill_updatedateincludingtext,bill_url
0,S,3012,119,Senate,S,Shutdown Fairness Act,2025-10-21,https://api.congress.gov/v3/bill/119/s/3012?fo...
1,HR,5720,119,House,H,Federal Worker Childcare Protection Act of 2025,2025-10-20,https://api.congress.gov/v3/bill/119/hr/5720?f...
2,HR,5705,119,House,H,To authorize the reimbursement by the Federal ...,2025-10-20,https://api.congress.gov/v3/bill/119/hr/5705?f...
3,S,2982,119,Senate,S,Federal Employees Civil Relief Act,2025-10-20,https://api.congress.gov/v3/bill/119/s/2982?fo...
4,S,2963,119,Senate,S,Fair Pay for Federal Contractors Act of 2025,2025-10-20,https://api.congress.gov/v3/bill/119/s/2963?fo...


In [138]:
bills[['bill_url']].duplicated().value_counts()

False    2735
Name: count, dtype: int64

In [139]:
# Outer-join bills to sponsored_legislation on their API URLs. validate='1:1'
# raises if a URL repeats on either side; 'matched' records each row's origin.
all_bills = bills.merge(
    sponsored_legislation,
    left_on='bill_url',
    right_on='url',
    how='outer',
    validate='1:1',
    indicator='matched',
)

In [140]:
all_bills['matched'].value_counts()

matched
right_only    11644
both           2735
left_only         0
Name: count, dtype: int64

In [141]:
all_bills.query('matched == "both"').head()

Unnamed: 0,bill_type,bill_number,bill_congress,bill_originchamber,bill_originchambercode,bill_title,bill_updatedateincludingtext,bill_url,introducedDate,type,url,number,title,bioguide_id,matched
4010,HCONRES,10.0,119.0,House,H,Emergency Border Control Resolution,2025-06-13,https://api.congress.gov/v3/bill/119/hconres/1...,2025-02-10,HCONRES,https://api.congress.gov/v3/bill/119/hconres/1...,10.0,Emergency Border Control Resolution,H001052,both
4011,HCONRES,11.0,119.0,House,H,Providing for a joint session of Congress to r...,2025-03-03,https://api.congress.gov/v3/bill/119/hconres/1...,2025-02-11,HCONRES,https://api.congress.gov/v3/bill/119/hconres/1...,11.0,Providing for a joint session of Congress to r...,S001212,both
4012,HCONRES,12.0,119.0,House,H,Supporting the Local Radio Freedom Act.,2025-10-18,https://api.congress.gov/v3/bill/119/hconres/1...,2025-02-13,HCONRES,https://api.congress.gov/v3/bill/119/hconres/1...,12.0,Supporting the Local Radio Freedom Act.,W000809,both
4014,HCONRES,14.0,119.0,House,H,Establishing the congressional budget for the ...,2025-10-09,https://api.congress.gov/v3/bill/119/hconres/1...,2025-02-18,HCONRES,https://api.congress.gov/v3/bill/119/hconres/1...,14.0,Establishing the congressional budget for the ...,A000375,both
4017,HCONRES,17.0,119.0,House,H,Authorizing the use of Emancipation Hall in th...,2025-06-12,https://api.congress.gov/v3/bill/119/hconres/1...,2025-03-05,HCONRES,https://api.congress.gov/v3/bill/119/hconres/1...,17.0,Authorizing the use of Emancipation Hall in th...,K000392,both


In [142]:
# Count how many URLs contain the literal text 'amendment'. The vectorised
# string method replaces a Python-level loop and does not raise on a missing
# URL; regex=False requests a plain substring match.
all_bills['url'].str.contains('amendment', regex=False).value_counts()

False    10369
True      4010
Name: count, dtype: int64

In [143]:
# The merge left two copies of type/number/title: the `bill_*` columns from
# `bills` (filled only for rows that matched a bill summary) and the unprefixed
# ones from `sponsored_legislation`. Coalesce them before the unprefixed
# columns are dropped, so legislation without a summary keeps its identifying
# fields instead of being written out as NaN. Rows whose `type`/`number`/`title`
# are themselves missing (the amendment rows in the preview above) remain NaN.
# NOTE: this cell consumes the unprefixed columns, so re-run the merge cell
# before running it a second time.
all_bills = all_bills.assign(
    bill_type=all_bills['bill_type'].fillna(all_bills['type']),
    bill_number=all_bills['bill_number'].fillna(all_bills['number']),
    bill_title=all_bills['bill_title'].fillna(all_bills['title']),
)
all_bills = all_bills[['bill_type', 'bill_number', 'bill_congress', 'bill_originchamber', 'bill_originchambercode', 'bill_title', 'bill_updatedateincludingtext', 'introducedDate', 'url', 'bioguide_id']]
# Overwrites the bills.csv written earlier with the widened table.
all_bills.to_csv('../data/3NF/bills.csv', index=False)

In [144]:
terms.head(10)

Unnamed: 0,bioguide_id,chamber,congress,stateCode,startYear,endYear,district
0,R000575,House of Representatives,108,AL,2003,2005,3.0
1,R000575,House of Representatives,109,AL,2005,2007,3.0
2,R000575,House of Representatives,110,AL,2007,2009,3.0
3,R000575,House of Representatives,111,AL,2009,2011,3.0
4,R000575,House of Representatives,112,AL,2011,2013,3.0
5,R000575,House of Representatives,113,AL,2013,2015,3.0
6,R000575,House of Representatives,114,AL,2015,2017,3.0
7,R000575,House of Representatives,115,AL,2017,2019,3.0
8,R000575,House of Representatives,116,AL,2019,2021,3.0
9,R000575,House of Representatives,117,AL,2021,2023,3.0


In [145]:
# Standard deviation of startYear and endYear within each congress. A non-zero
# value means the year is not fixed by the congress number alone.
terms.groupby('congress')[['startYear', 'endYear']].std()

Unnamed: 0_level_0,startYear,endYear
congress,Unnamed: 1_level_1,Unnamed: 2_level_1
94,,
95,0.0,0.0
96,0.0,0.0
97,0.0,0.0
98,0.0,0.0
99,0.0,0.0
100,0.0,0.0
101,0.0,0.0
102,0.0,0.0
103,0.0,0.0


In [146]:
# Distribution of start years among the rows for congress 111.
terms.loc[terms['congress'] == 111, 'startYear'].value_counts()

startYear
2009    122
2010      2
Name: count, dtype: int64

In [147]:
terms.to_csv('../data/3NF/terms.csv', index=False)

In [148]:
contrib.head()

Unnamed: 0,contributor_name,contributor_aggregate_ytd,memo_text,pdf_url,fec_committee_id,fec_id
600548,"ARENSON, PETER",1300.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
600174,"BIRMINGHAM, BARBARA",2000.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
601465,"BAUER, MICHAEL",250.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
602341,"CAMPBELL, THOMAS",400.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096
600547,"BODEN, HANS",2300.0,,http://docquery.fec.gov/cgi-bin/fecimg/?109300...,C00457556,H0IL05096


In [159]:
contrib.to_csv('../data/3NF/contrib.csv', index=False)

In [150]:
def map_pandas_dtype_to_dbml_type(dtype) -> str:
    """Maps a pandas dtype to a DBML type.

    The decision is made on the lower-cased string form of ``dtype``, so the
    capitalised pandas nullable dtypes ("Int64", "UInt8", "Float64", ...) map
    the same way as their NumPy counterparts instead of falling through to
    "varchar".

    Args:
        dtype: A dtype object (or dtype name) whose ``str()`` names the type.

    Returns:
        "int", "float" or "datetime" when the dtype name contains that word
        (checked in that order); "varchar" for everything else.
    """
    dtype_name = str(dtype).lower()
    if "int" in dtype_name:
        return "int"
    if "float" in dtype_name:
        return "float"
    if "datetime" in dtype_name:
        return "datetime"
    return "varchar"

def pandas_df_to_dbml(df: pd.DataFrame, table_name: str) -> str:
    """
    Converts a pandas DataFrame to a DBML string.

    Each column becomes one ``<name> <type>`` line, with the type obtained from
    ``map_pandas_dtype_to_dbml_type``. Table and column names that are not
    plain ASCII identifiers (for example names containing a space or a dot)
    are wrapped in double quotes; plain names are emitted unchanged.

    Args:
        df: The pandas DataFrame to convert.
        table_name: The name of the table in the DBML schema.

    Returns:
        A DBML string representing the DataFrame schema.
    """

    def dbml_name(name) -> str:
        # Column labels are not necessarily strings, so stringify first.
        text = str(name)
        if text.isascii() and text.isidentifier():
            return text
        # NOTE(review): names that themselves contain a double quote are not
        # escaped here - confirm DBML's escaping rules if that case can occur.
        return f'"{text}"'

    lines = [f"Table {dbml_name(table_name)} {{"]
    lines.extend(
        f"  {dbml_name(column_name)} {map_pandas_dtype_to_dbml_type(column_type)}"
        for column_name, column_type in df.dtypes.items()
    )
    lines.append("}")
    # Every line, including the closing brace, ends with a newline.
    return "\n".join(lines) + "\n"

In [151]:
bill_versions = pd.read_csv('../data/3NF/bill_versions.csv')
print(pandas_df_to_dbml(bill_versions, 'bill_versions'))

Table bill_versions {
  actiondate varchar
  actiondesc varchar
  currentchamber varchar
  currentchambercode varchar
  lastsummaryupdatedate varchar
  text varchar
  updatedate varchar
  versioncode int
  bill_number int
  bill_type varchar
}



In [152]:
bills = pd.read_csv('../data/3NF/bills.csv')
print(pandas_df_to_dbml(bills, 'bills'))

Table bills {
  bill_type varchar
  bill_number float
  bill_congress float
  bill_originchamber varchar
  bill_originchambercode varchar
  bill_title varchar
  bill_updatedateincludingtext varchar
  introducedDate varchar
  url varchar
  bioguide_id varchar
}



In [153]:
members = pd.read_csv('../data/3NF/members.csv')
print(pandas_df_to_dbml(members, 'members'))

Table members {
  bioguide_id varchar
  birthyear float
  image varchar
  office_address varchar
  phone varchar
  website varchar
  fec_id varchar
  bioname varchar
  chamber varchar
  left_right_ideology float
  state_abbrev varchar
  district_code int
  icpsr int
  party varchar
}



In [154]:
terms = pd.read_csv('../data/3NF/terms.csv')
print(pandas_df_to_dbml(terms, 'terms'))

Table terms {
  bioguide_id varchar
  chamber varchar
  congress int
  stateCode varchar
  startYear int
  endYear int
  district float
}



In [155]:
vote_compare = pd.read_csv('../data/3NF/vote_compare.csv')
print(pandas_df_to_dbml(vote_compare, 'vote_compare'))

Table vote_compare {
  bioname varchar
  comparison_member varchar
  agree float
}

