In [1]:
import re

import pandas as pd
from pandas import DataFrame as DF, Series

import numpy as np

import requests
from bs4 import BeautifulSoup

from pprint import pprint

In [2]:
baseurl = 'http://results.oregonvotes.gov'
u = '/ResultsSW.aspx?type=CTYALL&cty=26&map=CTY&eid=82'
url = baseurl + u

In [2]:
def extract_rows(div):
    rows = div.findAll('div', {'class': 'section group'})
    rows = [d for d in rows if not d.parent.attrs]
    rows = [
        d for d in rows if 
        d.parent.parent.attrs['class'] == ['wrapper-inside', 'wrapper-border']
    ]
    return rows

def extract_title(div):
    return div.find('h1').contents[0]

def toint(s):
    return int(''.join(s.split(',')))

def extract_table_data(rows):
    data = []
    for i,r in enumerate(rows):
        row = []
        if i == len(rows) - 1:
            row.append('total')
            row.append(toint(r.findAll('div')[-1].contents[0]))
        else:
            row.append(r.find('h1').contents[0].strip())
            row.append(toint(r.findAll('div')[-1].contents[0]))
        data.append(row)
    return data

def extract_all_data(divs):
    all_data = {}
    for i,div in enumerate(divs):
        title = extract_title(div)
        rows = extract_rows(div)
        try:
            data = extract_table_data(rows)
            all_data[title] = data
        except Exception as e:
            print(i)
            print(title)
            print(e)
    return all_data

def get_html(url):
    r = requests.get(url.replace('\n', ''))
    return r.content.decode('utf-8')

def extract_divs(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup.findAll('div', {'class': ['wrapper-inside wrapper-border']})

def scrape_fancypage(url):
    html = get_html(url)
    divs = extract_divs(html)
    return extract_all_data(divs)


## Scrape All Fancy Pages

In [3]:
baseurl = 'http://results.oregonvotes.gov/ResultsSW.aspx?type=CTYALL&cty=26&map=CTY'
urls = {
    '': 'P2018',
    '&eid=4': 'S2017',  # special election
    '&eid=82': 'G2016',
}

In [4]:
all_fancypage_data = {}
for u,v in urls.items():
    url = baseurl + u
    key = v
    all_data = scrape_fancypage(url)
    all_fancypage_data[key] = all_data

88
Burlington Water Commissioner, Pos 3
'NoneType' object is not callable
89
Burlington Water Commissioner, Pos 5
'NoneType' object is not callable
98
Pleasant Home Water Comm, Pos 1
'NoneType' object is not callable
39
City of Wood Village Council, Pos 3
'NoneType' object is not callable
44
West Soil & Water, Director, At Large 1
'NoneType' object is not callable
51
Interlachen Water PUD Dir, Sub-Dist 5
'NoneType' object is not callable


In [5]:
list(all_fancypage_data['P2018'])

['City of Portland Auditor',
 'Multnomah Co Commissioner Dist #2',
 'Multnomah County Sheriff',
 'City of Portland Commissioner, Pos 3',
 '34-284 Beaverton School District #48JT Levy Renewal to Protect Beaverton Schools Teachers and Class Sizes',
 "26-197 CITY OF PORTLAND Renew Portland Children's Levy for five years.",
 'Metro Councilor, District 1',
 'Metro Councilor, District 2',
 'City of Portland Commissioner, Pos 2',
 'Multnomah County Auditor',
 'Metro Council President',
 'Metro Auditor',
 'Multnomah County Comm Chair']

In [6]:
all_fancypage_data['P2018']['Metro Auditor']

[['Brian Evans', 80933], ['Write-in', 969], ['total', 81902]]

## Scrape Odd 2016 Primary Page

In [7]:
url = 'https://multco.us/elections/may-2016-primary-election-results'
html = get_html(url)

In [8]:
def extract_oddpage_divs_titles(html):
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.findAll('div', {'class': ['views-row']})
    titles = [h.contents[0] for h in soup.findAll('h3')][2:]
    divs = [d for d in divs if d.parent.attrs['class'] == ['view-content']][1:]
    return divs, titles

def extract_rows(divs, titles):
    table_rows = {}
    first = True
    rows = []
    tix = 0
    for d in divs:
        if 'views-row-1' in d.attrs['class']:
            if not first:
                table_rows[titles[tix]] = rows
                tix += 1
            else:
                first = False
            rows = []
        rows.append(d)
    return table_rows

def extract_table_data(trows):
    data = []
    for i,r in enumerate(trows):
        divs = r.findAll('div')
        if i < len(trows) - 1:
            contents = [d.contents for d in divs[1:3]]
            name = contents[0][-1]
            votes = toint(contents[1][-1])
            data.append([name, votes])
        else:
            contents = [d.contents for d in divs[-2:]]
            a = contents[0][0].contents[0]
            a = re.sub(r':\s*', '', a)
            u = toint(contents[0][1])

            b = contents[1][0].contents[0]
            b = re.sub(r':\s*', '', b)
            v = toint(contents[1][1])
            data.append([a, u])
            data.append([b, v])
    return data

def scrape_oddpage(url):
    html = get_html(url)
    divs, titles = extract_oddpage_divs_titles(html)
    table_rows = extract_rows(divs, titles)
    all_data = {}
    for title, trows in table_rows.items():
        all_data[title] = extract_table_data(trows)
    return all_data

In [9]:
all_odd_data = scrape_oddpage(url)

In [10]:
all_odd_data

{'Attorney General (DEM)': [['Ellen Rosenblum', 118380],
  ['Write-in', 1649],
  ['Under Votes', 69944],
  ['Over Votes', 0]],
 'Attorney General (IND)': [['No candidate filed', 0],
  ['Write-in', 1070],
  ['Under Votes', 5844],
  ['Over Votes', 0]],
 'Attorney General (REP)': [['Daniel Zene Crowe', 17229],
  ['Write-in', 394],
  ['Under Votes', 19109],
  ['Over Votes', 0]],
 'City of Portland Commissioner, Pos 1': [['Lanita Duke', 16248],
  ['Amanda Fritz', 120225],
  ['Ann Sanderson', 18167],
  ['Tabitha Ivan', 5789],
  ['Sara Long', 4395],
  ['David Morrison', 7117],
  ['Write-in', 1396],
  ['Under Votes', 47262],
  ['Over Votes', 102]],
 'City of Portland Commissioner, Pos 4': [['Michael W Durrow', 8514],
  ['Leah Marie Dumas', 4048],
  ['Steve Novick', 72140],
  ['Suzanne Stahl', 11194],
  ['Fred Stewart', 13848],
  ['Joseph Puckett', 2047],
  ['James Bernard Lee', 4194],
  ['Chloe Eudaly', 25474],
  ['Shannon Estabrook', 2487],
  ['Stuart Emmons', 23735],
  ['Write-in', 1308],
  