US_119_bill_sponsorships.csv DOWNLOADED FROM PLURAL

THE BELOW IS CODE USED TO PREPROCESS THE FILE AND MATCH THE REPRESENTATIVE WITH THEIR ASSOCIATED BIOGUIDE_ID AND THEN POPULATED THE ASSOCIATED COLUMNS

In [99]:
# Install rapidfuzz
!pip install rapidfuzz




In [100]:
# Import libraries
import pandas as pd
from rapidfuzz import process, fuzz


In [101]:
# Load data
sponsorships = pd.read_csv("US_119_bill_sponsorships.csv", encoding="latin1")
historical = pd.read_csv("legislators-historical.csv")
current = pd.read_csv("legislators-current.csv")


In [102]:
# Filter legislators born in 1900 or later
historical = historical[pd.to_datetime(historical['birthday'], errors='coerce') >= '1900-01-01']
current = current[pd.to_datetime(current['birthday'], errors='coerce') >= '1900-01-01']


In [103]:
# Tag source and prioritize current over historical
#historical['source'] = 'historical'
current['source'] = 'current'
legislators = pd.concat([current, historical], ignore_index=True)


In [104]:
# Clean name fields
sponsorships['name_clean'] = sponsorships['name'].str.lower().str.strip()
legislators['full_name_clean'] = legislators['full_name'].str.lower().str.strip()


In [105]:
# Fuzzy match names using rapidfuzz
def fuzzy_merge_fast(df_1, df_2, key1, key2, threshold=85):
    choices = df_2[key2].dropna().unique().tolist()
    matches = df_1[key1].apply(
        lambda x: process.extractOne(x, choices, scorer=fuzz.token_sort_ratio, score_cutoff=threshold)
    )
    df_1['matched_name'] = matches.apply(lambda x: x[0] if x else None)
    return df_1

sponsorships = fuzzy_merge_fast(sponsorships, legislators, 'name_clean', 'full_name_clean', threshold=85)


In [106]:
# Prepare unique legislator data, preferring current over historical
legislators_unique = legislators.drop_duplicates(subset='full_name_clean', keep='first')


In [107]:
# Merge matched rows only with legislator data
matched = sponsorships[sponsorships['matched_name'].notna()].copy()

matched = matched.merge(
    legislators_unique[['full_name_clean', 'bioguide_id', 'party', 'type', 'state', 'district']],
    left_on='matched_name',
    right_on='full_name_clean',
    how='left'
).drop(columns=['full_name_clean'])


In [108]:
# Ensure unmatched rows still have the necessary columns
unmatched = sponsorships[sponsorships['matched_name'].isna()].copy()
for col in ['bioguide_id', 'party', 'type', 'state', 'district']:
    unmatched[col] = None

# Combine matched and unmatched rows
final_df = pd.concat([matched, unmatched], ignore_index=True)


  final_df = pd.concat([matched, unmatched], ignore_index=True)


In [109]:
# Save final result with required columns
output_cols = [
    'name', 'bill_id', 'primary', 'classification',
    'matched_name', 'bioguide_id', 'party', 'type', 'state', 'district'
]

final_df[output_cols].to_csv("merged_bill_sponsorships.csv", index=False)
final_df[output_cols].head()


Unnamed: 0,name,bill_id,primary,classification,matched_name,bioguide_id,party,type,state,district
0,Rashida Tlaib,ocd-bill/32113334-4552-4e28-b5d4-98645b93e445,True,primary,rashida tlaib,T000481,Democrat,rep,MI,12.0
1,Mark Pocan,ocd-bill/32113334-4552-4e28-b5d4-98645b93e445,False,cosponsor,mark pocan,P000607,Democrat,rep,WI,2.0
2,Val T. Hoyle,ocd-bill/32113334-4552-4e28-b5d4-98645b93e445,False,cosponsor,val t. hoyle,H001094,Democrat,rep,OR,4.0
3,"JesÃºs G. ""Chuy"" GarcÃ­a",ocd-bill/32113334-4552-4e28-b5d4-98645b93e445,False,cosponsor,"jesús g. ""chuy"" garcía",G000586,Democrat,rep,IL,4.0
4,Paul Tonko,ocd-bill/32113334-4552-4e28-b5d4-98645b93e445,False,cosponsor,paul tonko,T000469,Democrat,rep,NY,20.0


THE BELOW CODE IS USED TO DETERMINE AUTHOR BILLS WITH BIPARTISAN SPONSOR (authors of bills with cross-party co-sponsors)



In [110]:
df = pd.read_csv("merged_bill_sponsorships.csv",  encoding="latin1")

In [111]:
# Drop rows with missing party information
df = df.dropna(subset=['party'])

In [112]:
# Get primary sponsors
primary_df = df[df['classification'] == 'primary'][['bill_id', 'name', 'party', 'bioguide_id']]
primary_df = primary_df.rename(columns={
    'name': 'primary_name',
    'party': 'primary_party',
    'bioguide_id': 'primary_bioguide_id'
})

In [113]:
# Get cosponsors
cosponsor_df = df[df['classification'] == 'cosponsor'][['bill_id', 'name', 'party']]

In [114]:
# Merge cosponsors with their bill's primary sponsor
merged_df = cosponsor_df.merge(primary_df, on='bill_id', how='left')

In [115]:
# Check if cosponsor is from a different party
merged_df['different_party'] = merged_df['party'] != merged_df['primary_party']

In [116]:
# Identify all primary sponsors (even with 0 cross-party cosponsors)
all_primary_sponsors = primary_df[['primary_name', 'primary_bioguide_id']].drop_duplicates()

In [117]:
# Find bills with at least one cross-party cosponsor
bills_with_diff_party = merged_df[merged_df['different_party']][['bill_id', 'primary_name', 'primary_bioguide_id']].drop_duplicates()

In [118]:
# Count per primary sponsor
cross_party_counts = bills_with_diff_party.groupby(['primary_name', 'primary_bioguide_id']).size().reset_index(name='num_bills_with_cross_party_cosponsors')

In [119]:
# Merge to include 0s
result = all_primary_sponsors.merge(cross_party_counts, on=['primary_name', 'primary_bioguide_id'], how='left')
result['num_bills_with_cross_party_cosponsors'] = result['num_bills_with_cross_party_cosponsors'].fillna(0).astype(int)

In [120]:
# Sort by count and assign rank
ranked_result = result.sort_values(by='num_bills_with_cross_party_cosponsors', ascending=False).reset_index(drop=True)
ranked_result['rank'] = ranked_result.index + 1

In [121]:
# Show result
print(ranked_result)
ranked_result.to_csv("ranked_cross_party_sponsorships.csv", index=False)

               primary_name primary_bioguide_id  \
0             Amy Klobuchar             K000367   
1          Marsha Blackburn             B001243   
2      Brian K. Fitzpatrick             F000466   
3    Catherine Cortez Masto             C001113   
4             Vern Buchanan             B001260   
..                      ...                 ...   
491         Robert Menendez             M001226   
492          Adam B. Schiff             S001150   
493         Yassamin Ansari             A000381   
494          Mary E. Miller             M001211   
495         Shomari Figures             F000481   

     num_bills_with_cross_party_cosponsors  rank  
0                                       22     1  
1                                       21     2  
2                                       21     3  
3                                       18     4  
4                                       18     5  
..                                     ...   ...  
491                           

THE BELOW CODE IS FOR EACH REPRESENTATIVE, HOW MANY BILLS THEY HAVE COSPONSORED WHERE THE PRIMARY SPONSOR IS FROM THE OPPOSITE PARTY(cosponsors of bills by other party)

In [133]:
# Load the dataset
df = pd.read_csv("merged_bill_sponsorships.csv",  encoding="latin1") 

In [134]:
# Drop rows with missing party info
df = df.dropna(subset=['party'])

In [135]:
# Get primary sponsors
primary_df = df[df['classification'] == 'primary'][['bill_id', 'party']]
primary_df = primary_df.rename(columns={'party': 'primary_party'})

In [136]:
# Get cosponsors (with name + bioguide_id)
cosponsor_df = df[df['classification'] == 'cosponsor'][['bill_id', 'name', 'party', 'bioguide_id']]


In [137]:
# Merge cosponsors with primary sponsor info
merged_df = cosponsor_df.merge(primary_df, on='bill_id', how='left')

In [138]:
# Check if cosponsor is from a different party
merged_df['different_party'] = merged_df['party'] != merged_df['primary_party']

In [139]:
# Get all cosponsors (even those with 0 cross-party cases)
all_cosponsors = cosponsor_df[['name', 'bioguide_id']].drop_duplicates()

In [140]:
# Get counts of cross-party cosponsorships
cross_party_df = merged_df[merged_df['different_party']][['name', 'bioguide_id', 'bill_id']].drop_duplicates()
cross_party_counts = cross_party_df.groupby(['name', 'bioguide_id']).size().reset_index(name='num_cross_party_cosponsored_bills')

In [141]:
# Merge to include cosponsors with 0 counts
result = all_cosponsors.merge(cross_party_counts, on=['name', 'bioguide_id'], how='left')
result['num_cross_party_cosponsored_bills'] = result['num_cross_party_cosponsored_bills'].fillna(0).astype(int)

In [142]:
# Sort by count
ranked_result = result.sort_values(by='num_cross_party_cosponsored_bills', ascending=False).reset_index(drop=True)

In [143]:
# Save to CSV
ranked_result.to_csv("ranked_cosponsors_cross_party_bills.csv", index=False)
print(ranked_result)

                     name bioguide_id  num_cross_party_cosponsored_bills
0    Brian K. Fitzpatrick     F000466                                182
1         Donald G. Davis     D000230                                109
2           Jimmy Panetta     P000613                                 92
3          Michael Lawler     L000599                                 64
4           Amy Klobuchar     K000367                                 61
..                    ...         ...                                ...
527             Ron Estes     E000298                                  0
528             Ben Cline     C001118                                  0
529          Ashley Moody     M001244                                  0
530          Julia Letlow     L000595                                  0
531          Mike Johnson     J000299                                  0

[532 rows x 3 columns]
