### Import of data & packages

In [1]:
# import pandas for data cleaning and plotly for graphing
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# graphing functions to make below less difficult
def bubble(x, y, color, size, labels, max_diameter=45.):
    """Show a bubble chart whose marker areas are proportional to ``size``.

    Parameters
    ----------
    x, y : sequence
        Marker coordinates.
    color : str or sequence
        Marker color(s), passed straight through to ``marker.color``.
    size : iterable of numbers
        Value that drives the area of each marker.
    labels : sequence
        Per-marker text, passed as the trace's ``text``.
    max_diameter : float, optional
        Desired pixel diameter of the largest marker. Defaults to 45, the
        value that used to be hard-coded.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Area scaling recommended by plotly for sizemode='area':
    #   sizeref = 2 * max(size) / (desired max diameter ** 2)
    # An empty `size` used to raise ValueError from max(), and an all-zero
    # `size` produced sizeref=0; fall back to plotly's neutral sizeref of 1.
    largest = max(size, default=0)
    sizeref = 2. * largest / (max_diameter ** 2) if largest > 0 else 1

    data = [go.Scatter(
        x=x,
        y=y,
        text=labels,
        mode='markers',
        marker=dict(
            color=color,
            size=size,
            sizemode='area',
            sizeref=sizeref,
            sizemin=4))]
    fig = go.Figure(data)
    fig.show()

def bar(x, y, colors):
    """Display a single-series bar chart.

    x      -- bar positions / category labels
    y      -- bar heights
    colors -- bar color(s), used as the trace's marker color
    """
    bars = go.Bar(x=x, y=y, marker=dict(color=colors))
    go.Figure(data=[bars]).show()
    
def barStack(labels, y1, y2):
    """Display a two-layer stacked bar chart over a shared set of x labels.

    labels -- x values shared by both layers
    y1     -- heights of the first (bottom) layer
    y2     -- heights of the second layer, stacked on top of y1
    """
    layers = [go.Bar(x=labels, y=heights) for heights in (y1, y2)]
    fig = go.Figure(data=layers)
    fig.update_layout(barmode='stack')
    fig.show()


### Get census data

In [3]:
# Load the census population estimates from a local CSV.
# The file is notably not encoded in UTF-8, so the ISO-8859-1 encoding is passed explicitly.
init = pd.read_csv('CensusData2019.csv', encoding = "ISO-8859-1")
# Preview the first few rows to confirm the load.
init.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437,...,1.917501,0.578434,1.186314,1.522549,0.563489,0.626357,0.745172,1.090366,1.773786,2.483744
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773,...,4.84731,6.018182,-6.226119,-3.902226,1.970443,-1.712875,4.777171,0.849656,0.540916,4.560062
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112,...,24.017829,16.64187,17.488579,22.751474,20.184334,17.725964,21.279291,22.398256,24.727215,24.380567
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327,...,-5.690302,0.292676,-6.897817,-8.132185,-5.140431,-15.724575,-18.238016,-24.998528,-8.754922,-5.165664
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870,...,1.385134,-4.998356,-3.787545,-5.797999,1.331144,1.329817,-0.708717,-3.234669,-6.857092,1.831952


In [4]:
# Show every column name. Wrapping the column Index in list() is needed because
# the default display only shows the first and last few entries.
list(init.columns)

['SUMLEV',
 'REGION',
 'DIVISION',
 'STATE',
 'COUNTY',
 'STNAME',
 'CTYNAME',
 'CENSUS2010POP',
 'ESTIMATESBASE2010',
 'POPESTIMATE2010',
 'POPESTIMATE2011',
 'POPESTIMATE2012',
 'POPESTIMATE2013',
 'POPESTIMATE2014',
 'POPESTIMATE2015',
 'POPESTIMATE2016',
 'POPESTIMATE2017',
 'POPESTIMATE2018',
 'POPESTIMATE2019',
 'NPOPCHG_2010',
 'NPOPCHG_2011',
 'NPOPCHG_2012',
 'NPOPCHG_2013',
 'NPOPCHG_2014',
 'NPOPCHG_2015',
 'NPOPCHG_2016',
 'NPOPCHG_2017',
 'NPOPCHG_2018',
 'NPOPCHG_2019',
 'BIRTHS2010',
 'BIRTHS2011',
 'BIRTHS2012',
 'BIRTHS2013',
 'BIRTHS2014',
 'BIRTHS2015',
 'BIRTHS2016',
 'BIRTHS2017',
 'BIRTHS2018',
 'BIRTHS2019',
 'DEATHS2010',
 'DEATHS2011',
 'DEATHS2012',
 'DEATHS2013',
 'DEATHS2014',
 'DEATHS2015',
 'DEATHS2016',
 'DEATHS2017',
 'DEATHS2018',
 'DEATHS2019',
 'NATURALINC2010',
 'NATURALINC2011',
 'NATURALINC2012',
 'NATURALINC2013',
 'NATURALINC2014',
 'NATURALINC2015',
 'NATURALINC2016',
 'NATURALINC2017',
 'NATURALINC2018',
 'NATURALINC2019',
 'INTERNATIONALMIG201

In [18]:
# SUMLEV == 40 marks state-level rows (as opposed to county-level rows).
# Keep only those rows, and only the state name and 2019 population estimate,
# which gives a much more manageable frame than the full census table.
is_state_row = init['SUMLEV'] == 40
state_pops = init.loc[is_state_row, ['STNAME', 'POPESTIMATE2019']]


### Get senator data

In [None]:
# Pull every table from the Wikipedia page listing current senators.
# NOTE: this fetches the live page each time the cell runs, so the tables
# (and their order) may differ between runs.
sen_scrape = pd.read_html('https://en.wikipedia.org/wiki/List_of_current_United_States_senators')

In [32]:
# Pick the senators table out of the scraped list by its shape: the first
# table with exactly 100 rows. Matching on length instead of a fixed position
# keeps this working if the tables on the page are reordered.
sen_table = next((table for table in sen_scrape if len(table) == 100), None)
if sen_table is None:
    # Fail loudly: without this check a miss left `sen_init` undefined, or
    # silently reused a stale value from an earlier run of this cell.
    raise ValueError(
        "No 100-row table found in the scraped page; "
        "the senators table layout may have changed."
    )

# and pare it down to the columns I want
sen_init = sen_table[['State', 'Senator', 'Party.1', 'Born', 'Assumed office', 'Term up']]
# @TODO: rename columns
sen_init

# ANALYSIS NOTE: Occupation / previous office / residence possibly useful later?

Unnamed: 0,State,Senator,Party.1,Born,Assumed office,Term up
0,Alabama,Richard Shelby,Republican[2],(age 86),"January 3, 1987",2022
1,Alabama,Tommy Tuberville,Republican,(age 66),"January 3, 2021",2026
2,Alaska,Lisa Murkowski,Republican,(age 63),"December 20, 2002[d]",2022
3,Alaska,Dan Sullivan,Republican,(age 56),"January 3, 2015",2026
4,Arizona,Kyrsten Sinema,Democratic,(age 44),"January 3, 2019",2024
...,...,...,...,...,...,...
95,West Virginia,Shelley Moore Capito,Republican,(age 67),"January 3, 2015",2026
96,Wisconsin,Ron Johnson,Republican,(age 65),"January 3, 2011",2022
97,Wisconsin,Tammy Baldwin,Democratic,(age 58),"January 3, 2013",2024
98,Wyoming,John Barrasso,Republican,(age 68),"June 25, 2007[y]",2024


In [19]:
# merge census data with senate data by state to get a final dataframe of info
sen_with_pop = sen_init.merge(state_pops, how='left', left_on='State', right_on='STNAME')

# Some people have complicated relationships with their party, which the
# scraped table marks with bracketed footnotes (e.g. "Republican[2]",
# "Independent[a]"). Strip any "[...]" marker from the party label instead of
# listing today's footnotes by hand, so a new footnote cannot leak into the
# party comparisons and color lookups further down.
sen_with_pop['Party.1'] = (
    sen_with_pop['Party.1']
    .str.replace(r'\[[^\]]*\]', '', regex=True)
    .str.strip()
)

In [None]:
# @TODO: Make this unnecessary? Ideally would work for any # of parties with any names, just pull from one big dataset?
# that might be easier if plotted in pandas instead of plotly tbf

# Get lists of various variables so they can be easily plugged into plotly
R_data = sen_with_pop[sen_with_pop['Party.1'] == 'Republican']
R_pops = list(R_data['POPESTIMATE2019'])
R_sen_list = list(R_data['Senator'])
R_states = list(R_data['State'])

# Everyone who is not a Republican, so this group includes independents
D_data = sen_with_pop[sen_with_pop['Party.1'] != 'Republican']
D_pops = list(D_data['POPESTIMATE2019']) # includes independents
D_sen_list = list(D_data['Senator'])
D_states = list(D_data['State'])

# The footnote on the independents' party label was already removed when
# sen_with_pop was built, so match the cleaned label. Comparing against
# 'Independent[a]' never matched and left I_pops empty.
I_data = sen_with_pop[sen_with_pop['Party.1'] == 'Independent']
I_pops = list(I_data['POPESTIMATE2019'])

In [None]:
# Seat coordinates for a senate chamber layout: a grid 20 columns wide and
# 5 rows deep, stored as [column, row] pairs in column-major order.
# @TODO: Republicans are currently on the LEFT

# columns 0-9
R_coords = [[col, row] for col in range(10) for row in range(5)]

# columns 10-19
D_coords = [[col, row] for col in range(10, 20) for row in range(5)]
        

In [None]:
# one-to-one list of coloring for any list going [republicans, democrats]
# Sized from the actual party lists rather than a fixed 50/50, so each color
# stays aligned with its senator whenever the split is uneven.
# NOTE: the chamber seat grid still hard-codes 50 seats per side.
colors = ['red'] * len(R_sen_list) + ['blue'] * len(D_sen_list)

# Combined lists in the same [republicans, democrats] order as `colors`
full_sen_list = R_sen_list + D_sen_list
state_list = R_states + D_states

In [None]:
# Bubble plot on the chamber grid: one bubble per senator, sized by the
# population of their state, with the senator's name as the mouseover label.
chamber_seats = R_coords + D_coords
x_vals = [seat[0] for seat in chamber_seats]
y_vals = [seat[1] for seat in chamber_seats]
pops = list(R_pops + D_pops)

bubble(x_vals, y_vals, colors, pops, full_sen_list)

In [None]:
# Summary stats for each party: the 2019 population estimates summed over the
# group's senators. A state's population is counted once per senator it has
# in the group.
all_Rs = sum(R_pops)
all_Ds = sum(D_pops)  # D_pops is built from every non-Republican row, so this includes independents
just_Is = sum(I_pops)

In [None]:
# Preferred approach: work from one sorted frame instead of parallel per-party lists.
# Caveat: sorting and then relying on that row order downstream is somewhat fragile.

# The data sorted by state population, lowest to highest, with a fresh 0..n-1 index.
pop_sorted_data = sen_with_pop.sort_values(by='POPESTIMATE2019').reset_index(drop=True)
pop_sorted_data.head()

In [None]:
# Sanity check: list the distinct party labels to see how many parties there
# are, and to spot any new footnote markers that still need cleaning up.
pop_sorted_data['Party.1'].unique()

In [None]:
# Which color each party gets, and the party column of the sorted data turned
# into a matching list of colors (one entry per row, in row order).
colordict = dict(Republican='red', Democratic='blue', Independent='gray', Vacant='purple')
colors = [colordict[party] for party in pop_sorted_data['Party.1']]
# NOTE - make this robust re footnotes: a label that is not a colordict key raises KeyError here


In [None]:
# Stacked bar chart (for 2 senators per state)

# Split the senators into two layers by row-index parity: even-indexed rows
# form the bottom layer, odd-indexed rows the top layer.
# NOTE(review): this reads sen_with_pop (not the population-sorted frame) and
# assumes each state's two senators sit on adjacent even/odd rows - confirm.
even_rows = sen_with_pop.index % 2 == 0
stack_1 = sen_with_pop[even_rows]
stack_2 = sen_with_pop[~even_rows]

# one color per bar segment, looked up from the senator's party
colors1 = [colordict[party] for party in stack_1['Party.1']]
colors2 = [colordict[party] for party in stack_2['Party.1']]

# both layers share the bottom layer's state names as x categories
state_names = list(stack_1['State'])

# Bottom half of each bar. The state population is halved so that the two
# stacked segments add up to the full population.
# (The hover value therefore shows the halved number - not ideal.)
trace1 = go.Bar(
    x=state_names,
    y=list(stack_1['POPESTIMATE2019'] / 2),
    marker=dict(color=colors1),
    text=state_names)  # state name as the bar text

# Top half, same thing
trace2 = go.Bar(
    x=state_names,
    y=list(stack_2['POPESTIMATE2019'] / 2),
    marker=dict(color=colors2),
    hovertext=list(stack_2['State']))  # maybe do abbreviations?

# make chart!
fig = go.Figure(data=[trace1, trace2])

# add a line between 9 and 10 to show the halfway point
# @TODO: make robust against different distributions of population
fig.add_vline(x=9.5, line_dash='dot')

# stack the two layers and order states by total bar height, tallest first
fig.update_layout(barmode='stack', xaxis=dict(categoryorder='total descending'))
fig.show()


In [None]:
# Very simple graph showing how many people Democrats vs Republicans represent.
# The "Democrat" bar uses all_Ds, which is summed over every non-Republican senator.
mylabels = ['Republican', 'Democrat']

# Each state's population appears once per senator in the sums, so halve the totals.
bar(mylabels, [all_Rs/2, all_Ds/2], ['red', 'blue']) # dividing by 2 because each state has 2 senators

In [None]:
# Senators ordered by state population (lowest first), then split into
# Republicans and everyone else (the second group includes independents).
sort_new = sen_with_pop.sort_values(by='POPESTIMATE2019')

is_republican = sort_new['Party.1'] == 'Republican'
Rns = sort_new[is_republican]
Dns = sort_new[~is_republican]


In [None]:
# Polar "hemicycle" chart of the Senate: two quarter-circle polar subplots
# side by side, non-Republicans on the left (90-180 degrees) and Republicans
# on the right (0-90 degrees). Each marker is one senator, with area scaled
# by the population of their state.

# Marker area scaling (plotly's sizemode='area' formula):
#   sizeref = 2 * reference_size / (max marker diameter ** 2)
# with a reference population of 40,000,000 drawn as a 50px marker.
SEAT_SIZEREF = 2.*40000000/(50.**2)


def _wing_trace(coords, senators, color):
    """Build the Scatterpolar trace for one side of the chamber.

    coords   -- list of (r, theta) seat positions
    senators -- DataFrame with 'Senator' and 'POPESTIMATE2019' columns,
                one row per seat
    color    -- marker color for every seat on this side
    """
    return go.Scatterpolar(
        r=[seat[0] for seat in coords],
        theta=[seat[1] for seat in coords],
        text=list(senators['Senator']),
        mode='markers',
        marker=dict(
            color=color,
            size=list(senators['POPESTIMATE2019']),
            sizemode='area',
            sizeref=SEAT_SIZEREF,
            sizemin=4))


def _wing_layout(sector):
    """Polar layout for one wing: given angular sector, white background, no axis decoration."""
    return dict(
        sector=sector,
        bgcolor="white",
        angularaxis=dict(showline=False, showticklabels=False, ticks=''),
        radialaxis=dict(showline=False, showticklabels=False, ticks=''))


fig = make_subplots(rows=1, cols=2,
                    specs=[[{'type': 'polar'}, {'type': 'polar'}]],
                    horizontal_spacing=0)

# Seat positions as (radius, angle in degrees). Radii run over rings 5-9 and
# the inner rings hold fewer seats, giving exactly 50 seats per side.
# The non-Republican side is the Republican side mirrored about 90 degrees.
rep_coords = [(r, a*90 / (r+2)) for a in range(12) for r in range(max(5, a-2), 10)]
dem_coords = [(r, 180 - a*90 / (r+2)) for a in range(12) for r in range(max(5, a-2), 10)]

# Combined coordinate lists; not used by the figure below, kept for later use.
r_vals = [a[0] for a in dem_coords] + [a[0] for a in rep_coords]
theta_vals = [a[1] for a in dem_coords] + [a[1] for a in rep_coords]

# Dns / Rns are the population-sorted per-party frames from the previous cell.
fig.add_trace(_wing_trace(dem_coords, Dns, 'blue'), row=1, col=1)
fig.add_trace(_wing_trace(rep_coords, Rns, 'red'), row=1, col=2)

fig.update_layout(
    title="US Senate",
    font_size=15,
    showlegend=False,
    polar=_wing_layout([90, 180]),
    polar2=_wing_layout([0, 90]),
    paper_bgcolor="white"
)

fig.show()

In [None]:

#sen_with_pop.to_csv('math.csv', index=False) # Write out to a csv so it can be read back in later