- Janine Quigley and Jon Adamson
- 13 December 2022
- CS 181
- Dr. Asif

Loading in libraries

In [21]:
import pandas as pd
import requests
from lxml import etree
import io
import random
import sqlite3 as sql
import plotly.express as px
import numpy as np

Below, we load the .csv file and use Pandas to turn it into a dataframe. From this dataframe, we get the policy area and status of a particular bill. The dataframe is referenced later in the finalgraph() function.

In [22]:
# House legislation table; loopinfo() looks up 'policy_area' and 'bill_progress' by 'bill_id' here.
data116 = pd.read_csv('house_legislation_116.csv')

#### Below, we define all of the functions that we will use in the finalgraph() function.

In [23]:
def reader(bill):
  '''
  Downloads the XML for one 2019 House roll-call vote and returns the root of the parsed tree.
  Inputs: bill: roll-call number (int or str); it is left-padded with zeros to the three
                digits used in the clerk.house.gov URL, so 43 and '043' are equivalent
  Outputs: Returns the root of the tree (the response body is parsed with lxml's HTMLParser)
  '''
  roll = str(bill).zfill(3)  # 43 -> '043'; values that already have 3+ characters are unchanged
  xml = f'https://clerk.house.gov/evs/2019/roll{roll}.xml'

  # A timeout keeps a stalled connection from blocking the notebook indefinitely.
  response = requests.get(xml, timeout=30)

  # NOTE: the status code is deliberately not checked (the original assert was disabled),
  # so a non-200 response body is parsed like any other.
  htmlparser = etree.HTMLParser()
  tree1 = etree.parse(io.BytesIO(response.content), htmlparser)
  return tree1.getroot()

In [24]:
def varlist(root,var):
  '''
  Collects one field from every recorded vote under the first 'vote-data' element.
  Inputs: root: root that is returned from the reader(bill) function
          var: xpath tail, relative to each 'recorded-vote' element, that selects the wanted value
  Outputs: The list of xpath matches for ./recorded-vote/{var} under the first 'vote-data' element
  '''
  # Indexing with [0] raises IndexError when the document has no 'vote-data' element.
  first_votedata = root.xpath('.//vote-data')[0]
  return first_votedata.xpath(f"./recorded-vote/{var}")

In [25]:
def listoflists(root):
  '''
  Gathers the per-member columns of one roll-call vote.
  Inputs: root: root that is returned from the reader(bill) function
  Outputs: List of four lists, ordered [names, states, parties, votes]
  '''
  # The queries run in the order state, name, party, vote; only the returned order differs.
  member_states = varlist(root,'legislator/@state')
  member_names = varlist(root,'legislator/@unaccented-name')
  member_parties = varlist(root,'legislator/@party')
  member_votes = varlist(root,'vote/text()')
  return [member_names, member_states, member_parties, member_votes]

In [26]:
def legnum(root):
    '''
    Looks up the text of every 'legis-num' element in the tree.
    Inputs: root: root that is returned from the reader(bill) function
    Outputs: List of the matching text values (empty when there is no 'legis-num' element)
    '''
    return root.xpath('.//legis-num/text()')

In [27]:
def dataframe(lol):
  '''
  Creates a dataframe from a list of lists.
  Inputs: lol: a list of four lists, ordered [names, states, parties, votes]
  Outputs: pandas dataframe with one row per member and the columns Name, State, Party, Vote
  '''
  df = pd.DataFrame(lol).T  # transpose so that each inner list becomes a column
  # Assign the labels directly: set_axis(..., inplace=True) was removed in pandas 2.0.
  df.columns = ['Name','State','Party','Vote']
  return df

In [28]:
def lolcreation(df):
  '''
  Renders the rows of a dataframe as the text that follows VALUES in an SQL INSERT statement.
  Inputs: df: pandas dataframe
  Outputs: string of the form "(a, b), (c, d)" with one tuple per row, in row order;
           an empty dataframe gives an empty string
  '''
  # NOTE: values are formatted with Python's tuple repr rather than bound as SQL parameters,
  # so the result is only suitable for trusted data.
  return ', '.join(str(tuple(df.iloc[i])) for i in range(len(df)))


def insertdb(df,db,iqry,s):
  '''
  Runs one INSERT statement against a SQLite database file and commits it.
  Inputs: df: unused; kept so existing calls keep working
          db: path of the SQLite database file
          iqry: the complete INSERT statement to execute
          s: unused; kept so existing calls keep working
  Outputs: None
  '''
  connection = sql.connect(db)
  try:
    cursor = connection.cursor()
    cursor.execute(iqry)
    connection.commit()
  finally:
    # Always release the file handle, even when the statement fails.
    connection.close()

In [29]:
def sqlselect(db,sqry):
  '''
  Runs one query against a SQLite database file and returns every row.
  Inputs: db: path of the SQLite database file
          sqry: the complete SELECT statement to execute
  Outputs: list of row tuples (empty when the query matches nothing)
  '''
  connection = sql.connect(db)
  try:
    cursor = connection.cursor()
    return cursor.execute(sqry).fetchall()
  finally:
    # The rows are fully fetched before this runs, so closing here is safe.
    connection.close()

In [30]:
def sqlcreate(db,cqry):
  '''
  Runs one CREATE statement against a SQLite database file.
  Inputs: db: path of the SQLite database file
          cqry: the complete CREATE statement to execute
  Outputs: None
  '''
  connection = sql.connect(db)
  try:
    cursor = connection.cursor()
    cursor.execute(cqry)
    # Commit explicitly so the new table never depends on implicit transaction handling.
    connection.commit()
  finally:
    connection.close()

In [31]:
def loop(numb):
  '''
  Draws random roll-call numbers and formats them for use in the vote URL.
  Inputs: numb: how many numbers to draw
  Outputs: list of numb entries drawn from 1-700; values below 100 are returned as
           zero-padded three-character strings ('007', '043'), larger values stay ints
  '''
  billnumber = []
  for _ in range(numb):
    item = random.randint(1,700)
    # Pad every value below 100 to three characters. Previously "< 100" was tested first,
    # so the "< 10" branch never ran and single digits came out as two characters ('05').
    if item < 100:
      item = str(item).zfill(3)
    billnumber.append(item)
  return billnumber

In [32]:
def loop2(root,billnumber):
    '''
    Fetches each roll call and collects the number of the legislation it voted on.
    Inputs: root: unused; every roll call is downloaded again with reader()
            billnumber: list of roll-call numbers to look up
    Outputs: List of legislation numbers with spaces replaced by periods. Roll calls that
             yield zero or several 'legis-num' values are skipped, so the result can be
             shorter than billnumber.
    '''
    legis_numbers = [legnum(reader(roll)) for roll in billnumber]
    return [found[0].replace(' ','.') for found in legis_numbers if len(found) == 1]

In [33]:
def loopinfo(HRnum):
  '''
  Looks up the policy area and progress of each bill in the data116 dataframe.
  Inputs: HRnum: list of bill ids, compared against the 'bill_id' column
  Outputs: dict with the keys 'Area' and 'Status'; each holds one entry per matching row,
           or 'NA' for a bill id that has no matching row
  '''
  def column_values(column):
    collected = []
    for bill_id in HRnum:
      matches = data116.loc[data116['bill_id'] == bill_id][column]
      if matches.empty:
        collected.append('NA')
      else:
        # Every matching row contributes an entry, not only the first one.
        collected.extend(matches)
    return collected

  return {'Area': column_values('policy_area'), 'Status': column_values('bill_progress')}

#### Final Analysis

In [35]:
db = 'test1.db'
def finalpopulate(n):
  '''
  Populates DB table according to bill number
  Inputs: n: unused; the roll calls to load are fixed in the list below
  Outputs: None. For each roll call, a table votetable<number> is created in db and filled
           with one row per member (Name, State, Party, Vote).
  '''
  billnumber = ['043', 367, 487, 674, 234, 345, 341, 352, 525, 191]
  for item in billnumber:
    # One download per roll call. The previous loop2() call here re-fetched all ten
    # roll calls on every iteration and its result was never used.
    root = reader(item)
    lol = listoflists(root)
    df = dataframe(lol)
    # NOTE: CREATE TABLE fails if the table already exists, so re-running against the
    # same database file requires dropping the tables first.
    cqry = f'''
  CREATE TABLE votetable{item}(
  Name VARCHAR(100) NOT NULL,
  State VARCHAR(100) NOT NULL,
  Party VARCHAR(100) NOT NULL,
  Vote VARCHAR(100) NOT NULL,
  PRIMARY KEY (Name)
  )
  '''
    sqlcreate(db,cqry)
    s = lolcreation(df)
    iqry = f'''INSERT INTO votetable{item} VALUES ''' + s
    insertdb(df,db,iqry,s)
finalpopulate(10)




In [None]:
billnumber = ['043', 367, 487, 674, 234, 345, 341, 352, 525, 191]
def finalgraph(n): 
  '''
  Creates a graph for each bill that is accessed through the DB
  Inputs: n: unused; the roll calls to plot come from the module-level billnumber list
  Outputs: None. Shows one bar chart per roll call with votes on the x axis, coloured by
           party, titled with the bill number, policy area and status.
  '''
  # Both lookups depend only on the full list, so they run once instead of once per bill.
  # loop2() ignores its first argument, so no extra download is needed to supply it.
  HRnum = loop2(None,billnumber)
  abdict = loopinfo(HRnum)
  for j, item in enumerate(billnumber):
    sqry = f'SELECT * from "votetable{item}"'
    a = sqlselect(db,sqry)
    dataframe1 = pd.DataFrame(a)
    print(abdict)
    # NOTE: indexing by j assumes loop2() kept an entry for every roll call, in order.
    title = ('Bill:' + str(HRnum[j]) + ', Policy Area:' +
             str(abdict['Area'][j]) + ', Status:' + str(abdict['Status'][j]))
    # Columns are the default integer labels: 2 is Party and 3 is Vote.
    fig1 = px.bar(dataframe1,x=3,color=2,title=title)
    fig1.show()

finalgraph(10)



>   After analyzing this .csv dataset and the SQL database, we have made a lot of progress toward answering our final question. We have noticed trends within policy areas and within political parties. 

> For the Democratic party, bills with the policy areas "Public Lands and Natural Resources", "Labor and Employment", and "Immigration" were more likely to be approved and passed.

> For the Republican party, bills with the policy areas "Crime and Law Enforcement", "Government Operations and Politics", and "Armed Forces and National Security" were more likely to be approved.

> We found that the policy areas with the highest frequency of parties voting the same way were "Economics and Public Finance", "International Affairs", and "Wealth".