## Extract - Web Data Scrape

In [9]:
# Import libraries and dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import time
import pathlib

In [3]:
# Find path to state csv
file = pathlib.Path("resources/states.csv")

In [4]:
# Read into state csv and convert abbrev into list
states_df = pd.read_csv(file)
state_abbrev = states_df['Code'].values.tolist()
state_abbrev


['AL',
 'AK',
 'AZ',
 'AR',
 'CA',
 'CO',
 'CT',
 'DE',
 'DC',
 'FL',
 'GA',
 'HI',
 'ID',
 'IL',
 'IN',
 'IA',
 'KS',
 'KY',
 'LA',
 'ME',
 'MD',
 'MA',
 'MI',
 'MN',
 'MS',
 'MO',
 'MT',
 'NE',
 'NV',
 'NH',
 'NJ',
 'NM',
 'NY',
 'NC',
 'ND',
 'OH',
 'OK',
 'OR',
 'PA',
 'RI',
 'SC',
 'SD',
 'TN',
 'TX',
 'UT',
 'VT',
 'VA',
 'WA',
 'WV',
 'WI',
 'WY']

In [36]:
# Mac users
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
# !which chromedriver

In [5]:
# Mac users
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [6]:
# Windows users
# executable_path = {'executable_path': 'chromedriver.exe'}
# browser = Browser('chrome', **executable_path, headless=False)

In [7]:
# Create empty list to append into
indicator_list = []

# Loop through each state
for state in state_abbrev:

    # Set URL to scrape
    url = 'https://elections.mit.edu/#state-' + state
    browser.visit(url)
    
    # Add time delay
    time.sleep(3)
    
    # Scrape page into Soup
    html = browser.html
    soup = bs(html, "html.parser")
    
    # Retrieve row 
    section = soup.find('section', id='statebyYear')
    rows = section.find_all('tr', class_='indicatorRow')
    
    # Loop through each row to pull indicator and value elements
    for row in rows:
        indicator_dict = {}
        indicator_dict['state']  = state
        indicator_dict['indicator_name'] = row.find('td', class_='indicatorName').text
        indicator_dict['indicator_value'] = row.find('span', class_='valueLbl').text
        indicator_list.append(indicator_dict)
        
# Close the browser after scraping
browser.quit()


In [31]:
print(indicator_list)

[{'state': 'AK', 'indicator_name': 'Data Completeness', 'indicator_value': '100.00%'}, {'state': 'AK', 'indicator_name': 'Disability- or Illness-related Voting Problems', 'indicator_value': '12.19%'}, {'state': 'AK', 'indicator_name': 'Mail Ballots Rejected', 'indicator_value': '0.27%'}, {'state': 'AK', 'indicator_name': 'Mail Ballots Unreturned', 'indicator_value': '13.17%'}, {'state': 'AK', 'indicator_name': 'Military and Overseas Ballots Rejected', 'indicator_value': '7.93%'}, {'state': 'AK', 'indicator_name': 'Military and Overseas Ballots Unreturned', 'indicator_value': '12.36%'}, {'state': 'AK', 'indicator_name': 'Online Registration Available', 'indicator_value': 'Yes'}, {'state': 'AK', 'indicator_name': 'Postelection Audit Required', 'indicator_value': 'Yes'}, {'state': 'AK', 'indicator_name': 'Provisional Ballots Cast', 'indicator_value': '6.13%'}, {'state': 'AK', 'indicator_name': 'Provisional Ballots Rejected', 'indicator_value': '0.08%'}, {'state': 'AK', 'indicator_name': '

## Transform - Clean, Normalize

In [36]:
# Convert into dataframe
indicator_df = pd.DataFrame(indicator_list)
indicator_df.head()

Unnamed: 0,state,indicator_name,indicator_value
0,AK,Data Completeness,100.00%
1,AK,Disability- or Illness-related Voting Problems,12.19%
2,AK,Mail Ballots Rejected,0.27%
3,AK,Mail Ballots Unreturned,13.17%
4,AK,Military and Overseas Ballots Rejected,7.93%


In [49]:
# Pivot dataframe to normalize data
indicator_pivot = (indicator_df.pivot(index='state', columns='indicator_name', values='indicator_value').rename_axis(None, axis =1)).reset_index()
indicator_pivot.columns = [column.lower().replace(' ', '_') for column in indicator_pivot.columns]
indicator_pivot.rename(columns={'state': 'state_id'}, inplace=True)
indicator_pivot

Unnamed: 0,state_id,data_completeness,disability-_or_illness-related_voting_problems,mail_ballots_rejected,mail_ballots_unreturned,military_and_overseas_ballots_rejected,military_and_overseas_ballots_unreturned,online_registration_available,postelection_audit_required,provisional_ballots_cast,provisional_ballots_rejected,registration_or_absentee_ballot_problems,registrations_rejected,residual_vote_rate,turnout,voter_registration_rate,voting_information_lookup_tools_available,voting_wait_time
0,AK,100.00%,12.19%,0.27%,13.17%,7.93%,12.36%,Yes,Yes,6.13%,0.08%,5.42%,11.16%,0.83%,61.80%,87.50%,4,6.4 mins
1,AR,93.39%,20.19%,0.16%,7.95%,Incomplete Data,17.26%,No,No,0.41%,0.31%,4.65%,3.38%,0.63%,53.10%,83.71%,4,15.3 mins
2,AZ,98.56%,8.29%,0.40%,18.58%,2.70%,18.33%,Yes,Yes,3.77%,0.88%,6.35%,Incomplete Data,2.14%,56.22%,80.94%,4,5.7 mins
3,CA,99.13%,11.56%,0.40%,29.17%,4.74%,29.25%,Yes,Yes,8.95%,1.32%,7.86%,15.85%,2.55%,58.40%,79.77%,4,6.6 mins
4,CO,100.00%,7.06%,0.81%,22.17%,1.61%,40.22%,Yes,Yes,0.20%,0.05%,6.15%,3.30%,2.76%,72.09%,90.05%,4,2.4 mins
5,CT,88.24%,20.21%,0.15%,6.35%,1.79%,18.25%,Yes,Yes,0.00%,0.00%,3.29%,Incomplete Data,1.85%,65.43%,86.55%,4,6.1 mins
6,DC,100.00%,20.78%,0.01%,22.17%,4.05%,18.98%,Yes,Yes,1.11%,0.47%,3.17%,0.50%,0.42%,61.11%,93.63%,5,16.3 mins
7,DE,100.00%,17.92%,0.05%,11.93%,4.00%,19.60%,Yes,Yes,0.07%,0.06%,4.04%,7.58%,0.32%,64.61%,86.99%,5,4.9 mins
8,FL,100.00%,13.16%,0.23%,21.71%,3.17%,27.67%,No,Yes,0.25%,0.14%,6.03%,6.02%,0.81%,65.74%,87.01%,3,5.5 mins
9,GA,100.00%,14.85%,0.33%,10.08%,8.14%,27.37%,Yes,No,0.40%,0.22%,6.04%,0.29%,0.58%,59.89%,83.30%,5,16.6 mins


## Load - Use Sqlalchemy into Postgres

In [None]:
!pip install psycopg2
!pip install sqlalchemy

In [23]:
# Import libraries and dependencies 
from sqlalchemy import create_engine
from config import username
from config import password

In [21]:
# Create engine connection
engine = create_engine(f'postgresql://{username}:{password}@localhost:5432/voting_db')
connection = engine.connect()

In [43]:
engine.table_names()

['indicator']

In [42]:
indicator_pivot.to_sql(name='indicator', con=engine, if_exists='append', index=False)