In [1]:
import pandas as pd

In [2]:
# pull every table from the wikipedia page
sen_scrape = pd.read_html('https://en.wikipedia.org/wiki/List_of_current_United_States_senators', attrs = {'id': 'senators'})

sen_scrape[0].head()

Unnamed: 0,State,Portrait,Senator,Party,Party.1,Born,Occupation(s),Previous electiveoffice(s),Education,Assumed office,Term up,Residence
0,Alabama,,Richard Shelby,,Republican[2],(age 87),Lawyer,U.S. HouseAlabama Senate,University of Alabama Birmingham School of Law...,"January 3, 1987",2022,Tuscaloosa[3]
1,Alabama,,Tommy Tuberville,,Republican,(age 67),"College football coachPartner, investment mana...",,Southern Arkansas University,"January 3, 2021",2026,Auburn
2,Alaska,,Lisa Murkowski,,Republican,(age 64),Lawyer,Alaska House of Representatives,Georgetown University Willamette University Co...,"December 20, 2002[d]",2022,Girdwood[4]
3,Alaska,,Dan Sullivan,,Republican,(age 56),U.S. Marine Corps officerLawyerAssistant Secre...,Alaska Attorney General,Culver Military Academy Harvard University Geo...,"January 3, 2015",2026,Anchorage[5]
4,Arizona,,Kyrsten Sinema,,Democratic,(age 45),Social workerPolitical activistLawyerCollege p...,U.S. HouseArizona SenateArizona House of Repre...,Brigham Young University,"January 3, 2019",2024,Phoenix[6]


In [3]:
sen_data = sen_scrape[0][['State', 'Senator', 'Party.1', 'Assumed office', 'Term up']]

state_abbrevs = pd.read_csv("state_abbrevs.csv")
state_pops = pd.read_csv('census20pop.csv',index_col=0)

In [4]:

# give sen_data an uppercased states column to join with state abbrevs matching col
sen_data['temp_upper'] = sen_data['State'].map(lambda x: x.upper())
sen_data = sen_data.join(state_abbrevs.set_index('states'), on="temp_upper").drop(['temp_upper'], axis=1)

# join with populations
sen_data = sen_data.join(state_pops.set_index('state'), on="State")

# rename
sen_data.rename({'Party.1': 'party', 
                'Assumed office':'assumed_office',
                'Term up': 'term_up'}, inplace = True, axis=1)

# removes all footnotes (of the form [a]) with regex
sen_data.replace("\[\w*\]","", inplace = True, regex = True)


sen_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sen_data['temp_upper'] = sen_data['State'].map(lambda x: x.upper())


Unnamed: 0,State,Senator,party,assumed_office,term_up,st,population
0,Alabama,Richard Shelby,Republican,"January 3, 1987",2022,AL,5024279
1,Alabama,Tommy Tuberville,Republican,"January 3, 2021",2026,AL,5024279
2,Alaska,Lisa Murkowski,Republican,"December 20, 2002",2022,AK,733391
3,Alaska,Dan Sullivan,Republican,"January 3, 2015",2026,AK,733391
4,Arizona,Kyrsten Sinema,Democratic,"January 3, 2019",2024,AZ,7151502
...,...,...,...,...,...,...,...
95,West Virginia,Shelley Moore Capito,Republican,"January 3, 2015",2026,WV,1793716
96,Wisconsin,Ron Johnson,Republican,"January 3, 2011",2022,WI,5893718
97,Wisconsin,Tammy Baldwin,Democratic,"January 3, 2013",2024,WI,5893718
98,Wyoming,John Barrasso,Republican,"June 25, 2007",2024,WY,576851


# Religion

In [5]:
religion_scrape = pd.read_html("https://en.wikipedia.org/wiki/Religious_affiliation_in_the_United_States_Senate", match="Senator")


In [6]:
relig_data = religion_scrape[0]

for r in religion_scrape[1:]:
    relig_data = relig_data.append(r, ignore_index = True)

# removes all footnotes (of the form [a]) with regex
relig_data.replace("\[\w*\]","", inplace = True, regex = True)

# GOD DAMN IT WIKIPEDIA
relig_data['Senator'].replace("Bob Casey", "Bob Casey Jr.", inplace=True)
relig_data['Senator'].replace("John Neely Kennedy", "John Kennedy", inplace=True)

relig_data

# TODO: replace some of the terms here with less wordy ones

Unnamed: 0,Senator,Party,State,Religion,Notes
0,Marsha Blackburn,Republican,Tennessee,Presbyterian,
1,John Barrasso,Republican,Wyoming,Presbyterian,Former Catholic.
2,Shelley Moore Capito,Republican,West Virginia,Presbyterian,
3,Tom Carper,Democratic,Delaware,Presbyterian,
4,Chris Coons,Democratic,Delaware,Presbyterian,Attends a Catholic church.
...,...,...,...,...,...
95,Mazie Hirono,Democratic,Hawaii,Buddhist(Jōdo Shinshū),Non-practicing.
96,Tammy Baldwin,Democratic,Wisconsin,Unknown/Refused to specify,Baptized as an Episcopalian.
97,Michael Bennet,Democratic,Colorado,Unknown/Refused to specify,Raised in both Jewish and Christian traditions.
98,Tammy Duckworth,Democratic,Illinois,Unknown/Refused to specify,Sometimes acknowledged as Deist.


In [7]:
relig_data.replace({
    'Buddhist.*': 'Buddhist',
    'Congregationalist.*': 'Congregationalist',
    'Unknown.*': 'Unknown',
    'Holiness.*': 'Holiness',
    'Restorationist.*': 'Restorationist'
}, regex=True, inplace=True)

In [8]:

sen_data = relig_data[['Senator', 'Religion']].merge(sen_data, on = "Senator", how="outer")
sen_data

Unnamed: 0,Senator,Religion,State,party,assumed_office,term_up,st,population
0,Marsha Blackburn,Presbyterian,Tennessee,Republican,"January 3, 2019",2024,TN,6910840
1,John Barrasso,Presbyterian,Wyoming,Republican,"June 25, 2007",2024,WY,576851
2,Shelley Moore Capito,Presbyterian,West Virginia,Republican,"January 3, 2015",2026,WV,1793716
3,Tom Carper,Presbyterian,Delaware,Democratic,"January 3, 2001",2024,DE,989948
4,Chris Coons,Presbyterian,Delaware,Democratic,"November 15, 2010",2026,DE,989948
...,...,...,...,...,...,...,...,...
95,Mazie Hirono,Buddhist,Hawaii,Democratic,"January 3, 2013",2024,HI,1455271
96,Tammy Baldwin,Unknown,Wisconsin,Democratic,"January 3, 2013",2024,WI,5893718
97,Michael Bennet,Unknown,Colorado,Democratic,"January 21, 2009",2022,CO,5773714
98,Tammy Duckworth,Unknown,Illinois,Democratic,"January 3, 2017",2022,IL,12812508


# Gender

In [9]:
# sigh. maybe find a better source?
women_senators = pd.read_html("https://en.wikipedia.org/wiki/Women_in_the_United_States_Senate", match="Prior experience")
women_list = list(women_senators[0]['Name'])

In [10]:
sen_data['gender'] = "Male"

for row in sen_data.iterrows():
    if row[1]['Senator'] in women_list:
        sen_data['gender'].iloc[row[0]] = "Female" # mmmMMM

sen_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,Senator,Religion,State,party,assumed_office,term_up,st,population,gender
0,Marsha Blackburn,Presbyterian,Tennessee,Republican,"January 3, 2019",2024,TN,6910840,Female
1,John Barrasso,Presbyterian,Wyoming,Republican,"June 25, 2007",2024,WY,576851,Male
2,Shelley Moore Capito,Presbyterian,West Virginia,Republican,"January 3, 2015",2026,WV,1793716,Female
3,Tom Carper,Presbyterian,Delaware,Democratic,"January 3, 2001",2024,DE,989948,Male
4,Chris Coons,Presbyterian,Delaware,Democratic,"November 15, 2010",2026,DE,989948,Male
...,...,...,...,...,...,...,...,...,...
95,Mazie Hirono,Buddhist,Hawaii,Democratic,"January 3, 2013",2024,HI,1455271,Female
96,Tammy Baldwin,Unknown,Wisconsin,Democratic,"January 3, 2013",2024,WI,5893718,Female
97,Michael Bennet,Unknown,Colorado,Democratic,"January 21, 2009",2022,CO,5773714,Male
98,Tammy Duckworth,Unknown,Illinois,Democratic,"January 3, 2017",2022,IL,12812508,Female


In [11]:
sen_data['gender'].value_counts()

Male      76
Female    24
Name: gender, dtype: int64

# Race

In [12]:
sen_data['race'] = "White"

black_senators = ["Cory Booker", 'Tim Scott', 'Raphael Warnock']
asian_senators = ['Mazie Hirono', 'Tammy Duckworth']
hispanic_senators = ['Bob Menendez', 'Marco Rubio', 'Ted Cruz', 'Catherine Cortez Masto', 'Ben Ray Luján', 'Alex Padilla']

for row in sen_data.iterrows():
    if row[1]['Senator'] in black_senators:
        sen_data['race'].iloc[row[0]] = "Black" # mmmMMM
    if row[1]['Senator'] in asian_senators:
        sen_data['race'].iloc[row[0]] = "Asian American" # mmmMMM
    if row[1]['Senator'] in hispanic_senators:
        sen_data['race'].iloc[row[0]] = "Hispanic or Latino" # mmmMMM

print(sen_data['race'].value_counts())
sen_data

White                 89
Hispanic or Latino     6
Black                  3
Asian American         2
Name: race, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,Senator,Religion,State,party,assumed_office,term_up,st,population,gender,race
0,Marsha Blackburn,Presbyterian,Tennessee,Republican,"January 3, 2019",2024,TN,6910840,Female,White
1,John Barrasso,Presbyterian,Wyoming,Republican,"June 25, 2007",2024,WY,576851,Male,White
2,Shelley Moore Capito,Presbyterian,West Virginia,Republican,"January 3, 2015",2026,WV,1793716,Female,White
3,Tom Carper,Presbyterian,Delaware,Democratic,"January 3, 2001",2024,DE,989948,Male,White
4,Chris Coons,Presbyterian,Delaware,Democratic,"November 15, 2010",2026,DE,989948,Male,White
...,...,...,...,...,...,...,...,...,...,...
95,Mazie Hirono,Buddhist,Hawaii,Democratic,"January 3, 2013",2024,HI,1455271,Female,Asian American
96,Tammy Baldwin,Unknown,Wisconsin,Democratic,"January 3, 2013",2024,WI,5893718,Female,White
97,Michael Bennet,Unknown,Colorado,Democratic,"January 21, 2009",2022,CO,5773714,Male,White
98,Tammy Duckworth,Unknown,Illinois,Democratic,"January 3, 2017",2022,IL,12812508,Female,Asian American


In [13]:
sen_data = sen_data.rename(columns={
    "Senator": "senator",
    "Religion": "religion",
    "State": "state",
    "st": "abbrev"
})
sen_data.head()

Unnamed: 0,senator,religion,state,party,assumed_office,term_up,abbrev,population,gender,race
0,Marsha Blackburn,Presbyterian,Tennessee,Republican,"January 3, 2019",2024,TN,6910840,Female,White
1,John Barrasso,Presbyterian,Wyoming,Republican,"June 25, 2007",2024,WY,576851,Male,White
2,Shelley Moore Capito,Presbyterian,West Virginia,Republican,"January 3, 2015",2026,WV,1793716,Female,White
3,Tom Carper,Presbyterian,Delaware,Democratic,"January 3, 2001",2024,DE,989948,Male,White
4,Chris Coons,Presbyterian,Delaware,Democratic,"November 15, 2010",2026,DE,989948,Male,White


In [16]:
sen_data.to_csv("Senate_Data.csv")
sen_data.to_json("../senate-react/src/data0/senators0.json", orient="records")

In [15]:
sen_data['religion'].unique()

array(['Presbyterian', 'Baptist', 'Methodist', 'Lutheran', 'Evangelical',
       'Episcopalian', 'Congregationalist', 'Restorationist', 'Holiness',
       'Quaker', 'Protestant', 'Catholic', 'Latter-day Saint', 'Jewish',
       'Buddhist', 'Unknown', 'Unaffiliated'], dtype=object)