In [1]:
# pandas for dataframes, robobrowser for scraping
# re (regular expressions) for pattern-matching (to find iframes that point at Google Docs)
import pandas as pd
from robobrowser import RoboBrowser
import re

In [2]:
# spin up a RoboBrowser session and load the mainedems.org "find your caucus" page,
# which is the starting point for everything scraped below
caucus_index_url = 'http://www.mainedems.org/page/find-your-caucus'
browser = RoboBrowser()
browser.open(caucus_index_url)

In [3]:
# locate <div id="body"> in the browser's parsed page object ...
body_div = browser.parsed.find('div', id='body')

# ... then gather every <a> tag nested inside it
counties = body_div.find_all('a')

In [4]:
# collect one dataframe per linked page that embeds a Google Docs table
tables = []

# define the simple regex pattern for google docs --
# 'docs.google.com' must appear in the string
gdoc_pattern = re.compile(r'.*docs\.google\.com.*')

# iterate through the list of <a> tags found on the main page
for county in counties:

    # follow_link needs an 'href' attribute on the tag; skip bare anchors
    # (e.g. <a name="...">) so one of them cannot abort the whole scrape
    if not county.has_attr('href'):
        continue

    # have the browser open the <a> link
    browser.follow_link(county)

    # find the <iframe> whose 'src' attribute matches gdoc_pattern
    iframe = browser.find('iframe', src=gdoc_pattern)

    # pages without such an iframe contribute nothing
    if iframe:

        # create a pandas dataframe by reading in its 'src' attribute URL,
        # skipping an empty row, setting the next row as the header;
        # read_html returns a list of tables -- keep the first one
        county_df = pd.read_html(iframe.attrs['src'],
                                 header=0, skiprows=1)[0]

        # append it to the tables list
        tables.append(county_df)

    # go back to the main page
    browser.back()

In [5]:
# stack the list of dataframes into a single dataframe (a concatenation),
# discarding each table's own row numbers in favour of one fresh index
df = pd.concat(objs=tables, ignore_index=True)

# preview the first 5 rows
df.head(n=5)

Unnamed: 0,1,CONVENER,DOORS,LOCATION,MUNICIPAL,MUNICIPALITY,START,TIME
0,2,Ed Desgrosseilliers,,Auburn Middle School,,AUBURN,,1:00PM
1,3,Sarah Hall,,Durham Eureka Community Center,,DURHAM,,2:00PM
2,4,Matt Schlobohm and Kate Brennan,,Greene Town Office,,GREENE,,3:00PM
3,5,Richard Fochtmann,,Leeds Community Church,,LEEDS,,2:00PM
4,6,Richard Grandmaison,,Lewiston High School,,LEWISTON,,1:00PM


In [6]:
# drop columns whose label is made up only of digits stored as a string
# (that ugly/unnecessary '1' column).
# The isinstance check keeps a non-string label from raising AttributeError
# on .isdigit(); dropping once, without inplace, leaves the cell safe to re-run.
digit_cols = [col for col in df.columns
              if isinstance(col, str) and col.isdigit()]
df = df.drop(digit_cols, axis=1)

In [7]:
# save the dataframe as a CSV file; index=False leaves out the row index,
# which is just an auto-incrementing number of no value to us
df.to_csv(path_or_buf='maine-caucus-locations-times.csv', index=False)