### Import of data & packages

In [1]:
# import pandas for reading, cleaning and merging the data
import pandas as pd

### Get census data

In [2]:
# Load the census CSV. The file is not UTF-8, so the encoding has to be
# spelled out explicitly or the read fails on non-ASCII characters.
init = pd.read_csv(
    'CensusData2019.csv',
    encoding="ISO-8859-1",
)
init.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437,...,1.917501,0.578434,1.186314,1.522549,0.563489,0.626357,0.745172,1.090366,1.773786,2.483744
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773,...,4.84731,6.018182,-6.226119,-3.902226,1.970443,-1.712875,4.777171,0.849656,0.540916,4.560062
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112,...,24.017829,16.64187,17.488579,22.751474,20.184334,17.725964,21.279291,22.398256,24.727215,24.380567
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327,...,-5.690302,0.292676,-6.897817,-8.132185,-5.140431,-15.724575,-18.238016,-24.998528,-8.754922,-5.165664
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870,...,1.385134,-4.998356,-3.787545,-5.797999,1.331144,1.329817,-0.708717,-3.234669,-6.857092,1.831952


In [3]:
# Convert the column Index to a plain Python list so that every column name is
# displayed; the Index repr itself only shows the first and last few entries.
init.columns.tolist()

['SUMLEV',
 'REGION',
 'DIVISION',
 'STATE',
 'COUNTY',
 'STNAME',
 'CTYNAME',
 'CENSUS2010POP',
 'ESTIMATESBASE2010',
 'POPESTIMATE2010',
 'POPESTIMATE2011',
 'POPESTIMATE2012',
 'POPESTIMATE2013',
 'POPESTIMATE2014',
 'POPESTIMATE2015',
 'POPESTIMATE2016',
 'POPESTIMATE2017',
 'POPESTIMATE2018',
 'POPESTIMATE2019',
 'NPOPCHG_2010',
 'NPOPCHG_2011',
 'NPOPCHG_2012',
 'NPOPCHG_2013',
 'NPOPCHG_2014',
 'NPOPCHG_2015',
 'NPOPCHG_2016',
 'NPOPCHG_2017',
 'NPOPCHG_2018',
 'NPOPCHG_2019',
 'BIRTHS2010',
 'BIRTHS2011',
 'BIRTHS2012',
 'BIRTHS2013',
 'BIRTHS2014',
 'BIRTHS2015',
 'BIRTHS2016',
 'BIRTHS2017',
 'BIRTHS2018',
 'BIRTHS2019',
 'DEATHS2010',
 'DEATHS2011',
 'DEATHS2012',
 'DEATHS2013',
 'DEATHS2014',
 'DEATHS2015',
 'DEATHS2016',
 'DEATHS2017',
 'DEATHS2018',
 'DEATHS2019',
 'NATURALINC2010',
 'NATURALINC2011',
 'NATURALINC2012',
 'NATURALINC2013',
 'NATURALINC2014',
 'NATURALINC2015',
 'NATURALINC2016',
 'NATURALINC2017',
 'NATURALINC2018',
 'NATURALINC2019',
 'INTERNATIONALMIG201

In [4]:
# Rows with SUMLEV == 40 are state-level records (county-level rows use a
# different code). Keep only those rows, and only the state name and 2019
# population estimate, to get a much smaller and more manageable frame.
state_pops = init.loc[init['SUMLEV'].eq(40), ['STNAME', 'POPESTIMATE2019']]


### Get senator data

In [5]:
# Scrape the Wikipedia page; read_html gives back every table it finds there,
# and the one we actually need is picked out in the next cell.
sen_scrape = pd.read_html(
    io='https://en.wikipedia.org/wiki/List_of_current_United_States_senators',
)

In [6]:
# The senator roster is identified by its size: it is the first scraped table
# with exactly one row per Senate seat. It is unlikely that an earlier table on
# the page would ever have a hundred entries.
SENATOR_COUNT = 100
sen_roster = next(
    (table for table in sen_scrape if len(table) == SENATOR_COUNT),
    None,
)

# Fail loudly if the page layout changed. Without this check the column
# selection below would raise a confusing NameError on a fresh kernel, or --
# worse -- silently reuse a stale `sen_init` left over from an earlier run.
if sen_roster is None:
    raise ValueError(
        f"No table with exactly {SENATOR_COUNT} rows was found among the "
        f"{len(sen_scrape)} scraped tables; the Wikipedia page layout may have changed."
    )

# and pare it down to the columns we want
sen_init = sen_roster[['State', 'Senator', 'Party.1', 'Born', 'Assumed office', 'Term up']]


# ANALYSIS NOTE: Occupation / previous office / residence possibly useful later?

### Combine data & clean

In [7]:
# Build the final senator + population table in one chain (no in-place
# mutation, so re-running this cell always starts from sen_init/state_pops).
sen_with_pop = (
    sen_init
    # Strip Wikipedia footnote markers of the form [a] BEFORE merging, so a
    # footnote attached to a state name cannot stop it matching STNAME and
    # leave that senator with a missing population. The pattern is a raw
    # string: "\[" and "\w" are invalid escapes in a normal string literal.
    .replace(r"\[\w*\]", "", regex=True)
    # left merge keeps every senator row and attaches that state's population
    .merge(state_pops, how='left', left_on='State', right_on='STNAME')
    # tidy the column names
    .rename(columns={'Party.1': 'Party', 'POPESTIMATE2019': 'Population'})
    # STNAME duplicates State after the merge
    .drop(columns='STNAME')
)
sen_with_pop

Unnamed: 0,State,Senator,Party,Born,Assumed office,Term up,Population
0,Alabama,Richard Shelby,Republican,(age 86),"January 3, 1987",2022,4903185
1,Alabama,Tommy Tuberville,Republican,(age 66),"January 3, 2021",2026,4903185
2,Alaska,Lisa Murkowski,Republican,(age 63),"December 20, 2002",2022,731545
3,Alaska,Dan Sullivan,Republican,(age 56),"January 3, 2015",2026,731545
4,Arizona,Kyrsten Sinema,Democratic,(age 44),"January 3, 2019",2024,7278717
...,...,...,...,...,...,...,...
95,West Virginia,Shelley Moore Capito,Republican,(age 67),"January 3, 2015",2026,1792147
96,Wisconsin,Ron Johnson,Republican,(age 65),"January 3, 2011",2022,5822434
97,Wisconsin,Tammy Baldwin,Democratic,(age 58),"January 3, 2013",2024,5822434
98,Wyoming,John Barrasso,Republican,(age 68),"June 25, 2007",2024,578759


In [8]:
# Write the merged table out to a CSV (without the index column) so it can be
# pulled from that file later.
sen_with_pop.to_csv(path_or_buf='SPopulation.csv', index=False)