In [1]:
import re

from IPython.display import display

import pandas as pd
from pandas import DataFrame as DF, Series

import numpy as np

import requests
from bs4 import BeautifulSoup

In [2]:
def split_primary_parties(pre):
    parties = {}
    for i,l in enumerate(pre):
        if re.search(r'\*\s\(', l):
            party = [
                s.strip().strip('()') for s in l.split('*') if s
            ][0]
            parties[i] = party
    return parties

def get_html(url):
    r = requests.get(url.replace('\n', ''))
    return r.content.decode('utf-8')

def extract_pre(html):
    soup = BeautifulSoup(html, 'html.parser')
    pre = soup.find('pre')
    pre = pre.contents[0].split('\n')
    return [l.strip() for l in pre]

def extract_row(s):
    s = re.sub(r'\s+(\.\s+)+', '|', s)
    s = re.sub(r'\s{2,}', ' ', s)
    toks = s.split('|')
    return [toks[0], toks[1].split()[0]]

def scrape_rawpage(url, primary=False):
    html = get_html(url)
    pre = extract_pre(html)
    
    all_tables = {}
    parties = False
    if primary:
        parties = split_primary_parties(pre)
    party = None
    
    tables = {}
    title = None
    current_title = None
    parse = False
    for i,l in enumerate(pre):
        if l.startswith('VOTER TURNOUT'):
            parse = True
            continue
        if parties:
            if i in parties:
                if party:
                    all_tables[party] = tables
                party = parties[i]
                continue
        if not parse:
            continue
        if (l == '') or (re.search(r'Vote For\s+\d', l)):
            continue
        if not re.search(r'(\.\s{2})+', l):
            if title:
                title = ' '.join([title, l])
            else:
                title = l
        else:
            if title:
                current_title = title
            title = None
            tables.setdefault(current_title, [])
            # get row values
    #         print(l)
            row = extract_row(l)
            tables[current_title].append(row)
    if primary:
        return all_tables
    return tables

## Scrape All Pages

In [3]:
baseurl = 'https://multco.us/elections/'
urls = [
    'Gnovember-2012-general-election-election-results',
    'Pmay-15-2012-primary-election-election-results',
    'Gnovember-2-2010-election-results',
    'Pmay-18-2010-election-results',
    'Smay-19-2009-election-results',  # special election
    'Gnovember-4-2008-election-results',
    'Pmay-20-2008-election-results',
    'Gnovember-7-2006-election-results',
    'Pmay-16-2006-election-results',
    'Gnovember-2-2004-election-results',
    'Pmay-18-2004-election-results',
]

In [4]:
all_tables = {}
for u in urls:
    if u.startswith('P'):
        primary = True
    else:
        primary = False
    url = baseurl + u[1:]
    tables = scrape_rawpage(url, primary)
    year = re.search(r'20(0\d|1[0-8]{1})', u).group()
    key = u[0] + year
    all_tables[key] = tables

In [5]:
list(all_tables)

['P2004',
 'P2006',
 'S2009',
 'P2012',
 'G2006',
 'G2008',
 'P2008',
 'G2012',
 'G2004',
 'G2010',
 'P2010']

In [6]:
all_tables['P2012']

{'Democrat': {'26-125 MULTNOMAH COUNTY LIBRARY': [['Yes', '128,814'],
   ['No.', '23,566'],
   ['Over Votes', '22'],
   ['Under Votes', '8,424']],
  '26-126 CITY OF PORTLAND': [['Yes', '95,904'],
   ['No.', '15,356'],
   ['Over Votes', '36'],
   ['Under Votes', '28,712']],
  '26-127 CITY OF PORTLAND': [['Yes', '87,732'],
   ['No.', '25,581'],
   ['Over Votes', '32'],
   ['Under Votes', '26,663']],
  '26-128 CITY OF PORTLAND': [['Yes', '94,824'],
   ['No.', '20,739'],
   ['Over Votes', '23'],
   ['Under Votes', '24,422']],
  '26-129 CITY OF PORTLAND': [['Yes', '95,642'],
   ['No.', '21,823'],
   ['Over Votes', '30'],
   ['Under Votes', '22,513']],
  '26-130 CITY OF PORTLAND': [['Yes', '92,564'],
   ['No.', '24,894'],
   ['Over Votes', '23'],
   ['Under Votes', '22,527']],
  '26-131 CITY OF PORTLAND': [['Yes', '87,341'],
   ['No.', '26,517'],
   ['Over Votes', '36'],
   ['Under Votes', '26,114']],
  '26-132 CITY OF PORTLAND': [['Yes', '100,276'],
   ['No.', '11,454'],
   ['Over Votes', '

In [7]:
def display_primary_tables(key='P2012'):
    for party,data in all_tables[key].items():
        print(party.upper())
        print()
        for title,d in data.items():
            print(title)
            display(DF(d, columns=['name','votes']))
        print(20*'-')
        print()
        
def display_tables(key='2010'):
    for title,data in all_tables[key].items():
        print(title)
        display(DF(data, columns=['name','votes']))

In [8]:
display_tables()

KeyError: '2010'