# Parsing Los Angeles County's precinct-level results from the 2014 general election.

In [1]:
import pandas as pd
import pdfplumber
import re

Load the PDF in PDFPlumber:

In [2]:
# Open the sample bulletin; the resulting `pdf` object is reused by the cells below.
pdf = pdfplumber.open("2014-bulletin-first-10-pages.pdf")
# Show how many pages pdfplumber found in the document.
print(len(pdf.pages))

10


Let's look at the first 15 characters on the first page of the PDF:

In [3]:
# Keep a handle on the first page; later cells use it to prototype each extraction step.
first_page = pdf.pages[0]

In [4]:
# Load the page's character objects into a DataFrame (one row per character)
# so their text and coordinates can be inspected as a table.
chars = pd.DataFrame(first_page.chars)
chars.head(15)

Unnamed: 0,adv,bottom,doctop,fontname,height,object_type,pageid,size,text,top,upright,width,x0,x1,y0,y1
0,5.363,24.314,17.955,GATHEL+Helvetica,6.36,char,4,6.36,P,17.955,True,5.373,34.56,39.933,587.686,594.045
1,2.677,24.314,17.955,GATHEL+Helvetica,6.36,char,4,6.36,r,17.955,True,2.682,39.84,42.523,587.686,594.045
2,4.47,24.314,17.955,GATHEL+Helvetica,6.36,char,4,6.36,e,17.955,True,4.479,42.48,46.959,587.686,594.045
3,4.02,24.314,17.955,GATHEL+Helvetica,6.36,char,4,6.36,c,17.955,True,4.028,46.919,50.946,587.686,594.045
4,1.785,24.314,17.955,GATHEL+Helvetica,6.36,char,4,6.36,i,17.955,True,1.788,50.879,52.667,587.686,594.045
5,4.47,24.314,17.955,GATHEL+Helvetica,6.36,char,4,6.36,n,17.955,True,4.479,52.678,57.156,587.686,594.045
6,4.02,24.314,17.955,GATHEL+Helvetica,6.36,char,4,6.36,c,17.955,True,4.028,57.116,61.144,587.686,594.045
7,2.235,24.314,17.955,GATHEL+Helvetica,6.36,char,4,6.36,t,17.955,True,2.239,61.077,63.316,587.686,594.045
8,2.235,24.314,17.955,GATHEL+Helvetica,6.36,char,4,6.36,,17.955,True,2.239,63.358,65.597,587.686,594.045
9,2.235,24.314,17.955,GATHEL+Helvetica,6.36,char,4,6.36,,17.955,True,2.239,65.639,67.878,587.686,594.045


## Extract the precinct ID

The corresponding characters are about 37–44 pixels from the top, and on the left half of the page.

In [5]:
# Inspect only the characters inside the crop box: left half of the page,
# between 37 and 44 from the top.
pd.DataFrame(first_page.crop((0, 37, first_page.width / 2, 44 )).chars)

Unnamed: 0,adv,bottom,doctop,fontname,height,object_type,pageid,size,text,top,upright,width,x0,x1,y0,y1
0,5.363,44,38.114,GATHEL+Helvetica,5.886,char,4,6.36,P,38.114,True,5.373,34.56,39.933,567.052,573.886
1,5.805,44,38.114,GATHEL+Helvetica,5.886,char,4,6.36,R,38.114,True,5.816,39.84,45.656,567.052,573.886
2,5.363,44,38.114,GATHEL+Helvetica,5.886,char,4,6.36,E,38.114,True,5.373,45.599,50.972,567.052,573.886
3,5.805,44,38.114,GATHEL+Helvetica,5.886,char,4,6.36,C,38.114,True,5.816,50.879,56.695,567.052,573.886
4,2.235,44,38.114,GATHEL+Helvetica,5.886,char,4,6.36,I,38.114,True,2.239,56.638,58.877,567.052,573.886
5,5.805,44,38.114,GATHEL+Helvetica,5.886,char,4,6.36,N,38.114,True,5.816,58.919,64.735,567.052,573.886
6,5.805,44,38.114,GATHEL+Helvetica,5.886,char,4,6.36,C,38.114,True,5.816,64.677,70.493,567.052,573.886
7,4.912,44,38.114,GATHEL+Helvetica,5.886,char,4,6.36,T,38.114,True,4.922,70.436,75.358,567.052,573.886
8,2.235,44,38.114,GATHEL+Helvetica,5.886,char,4,6.36,,38.114,True,2.239,75.358,77.597,567.052,573.886
9,2.235,44,38.114,GATHEL+Helvetica,5.886,char,4,6.36,,38.114,True,2.239,77.639,79.878,567.052,573.886


In [6]:
def get_precinct_id(page):
    """Return the page's precinct header with runs of spaces collapsed to "|".

    The header is read from a fixed crop box: the left half of the page,
    between 37 and 44 from the top.

    Args:
        page: A pdfplumber page (anything exposing ``width`` and ``crop``).

    Returns:
        str: The header text with every run of one or more spaces replaced
        by a single "|", e.g. ``"PRECINCT|0050003A|ACTON"``.
    """
    header_box = (0, 37, page.width / 2, 44)
    header_chars = page.crop(header_box).chars
    raw_text = "".join([char["text"] for char in header_chars])
    # Column gaps in the PDF show up as runs of spaces; turn each run into one delimiter.
    return re.sub(r" +", "|", raw_text)

In [7]:
# Sanity-check the extraction against every page in the PDF.
for page in pdf.pages:
    print(get_precinct_id(page))

PRECINCT|0050003A|ACTON
PRECINCT|0050004B|ACTON
PRECINCT|0050005A|ACTON
PRECINCT|0050014A|ACTON
PRECINCT|0050059A|ACTON
PRECINCT|0050061A|ACTON
PRECINCT|0070001A|AGOURA
PRECINCT|0070004A|AGOURA
PRECINCT|0070004C|AGOURA
PRECINCT|0070006A|AGOURA


## We can do the same for the number of ballots cast

In [8]:
def get_ballots_cast(page):
    """Return the ballots-cast count printed on a page.

    The text is read from a fixed crop box: the left third of the page,
    between 48 and 60 from the top. Everything before the first space is
    parsed as the count.

    Args:
        page: A pdfplumber page (anything exposing ``width`` and ``crop``).

    Returns:
        int: The leading number found in the cropped region.

    Raises:
        ValueError: If the text before the first space is not an integer.
    """
    region = page.crop((0, 48, page.width / 3, 60))
    line = "".join([ch["text"] for ch in region.chars])
    # Only the first space-delimited token is the number; the rest is label text.
    leading_token, _, _ = line.partition(" ")
    return int(leading_token)

In [9]:
# Print the ballots-cast count extracted from each page.
for page in pdf.pages:
    print(get_ballots_cast(page))

327
141
216
196
325
316
150
189
164
334


## ... and for the number of registered voters in each precinct

In [10]:
def get_registered_voters(page):
    """Return the registered-voter count printed on a page.

    The text is read from a fixed crop box: the left third of the page,
    between 62 and 74 from the top. Everything before the first space is
    parsed as the count.

    Args:
        page: A pdfplumber page (anything exposing ``width`` and ``crop``).

    Returns:
        int: The leading number found in the cropped region.

    Raises:
        ValueError: If the text before the first space is not an integer.
    """
    bbox = (0, 62, page.width / 3, 74)
    pieces = (glyph["text"] for glyph in page.crop(bbox).chars)
    # The number comes first on the line; drop whatever follows the first space.
    number_text = "".join(pieces).split(" ", 1)[0]
    return int(number_text)

In [11]:
# Print the registered-voter count extracted from each page.
for page in pdf.pages:
    print(get_registered_voters(page))

1100
602
753
723
1233
1170
916
759
386
1136


## Getting the results for each race is a bit trickier

The data representation isn't truly tabular, but it's structured enough to allow us to create tabular data from it. This function divides the first column of the result-listings into columns (explicitly defined, in pixels) and rows (separated by gutters of whitespace).

In [12]:
def get_results_rows(page):
    """Extract the first column of result listings from a page as raw table rows.

    The page is cropped to the strip from x=0 to x=212, starting 77 from the
    top and running to the bottom of the page. That strip is then split into
    cells using explicit vertical dividers and whitespace gutters between rows.

    Args:
        page: A pdfplumber page (anything exposing ``height`` and ``crop``).

    Returns:
        Whatever ``extract_table`` returns for the cropped region; in the
        notebook this is a list of three-cell rows such as
        ``['NEEL KASHKARI', 'REP', '247']``.
    """
    # Explicit x-positions of the column dividers: name | party | votes.
    column_edges = (0, 158, 180, 212)
    results_region = page.crop((0, 77, column_edges[-1], page.height))
    return results_region.extract_table(v=column_edges, h="gutters", x_tolerance=1)

In [13]:
# Preview the raw rows extracted from the first page.
get_results_rows(first_page)

[['GOVERNOR', None, None],
 ['VOTER NOMINATED', None, None],
 ['NEEL KASHKARI', 'REP', '247'],
 ['EDMUND G BROWN', 'DEM', '69'],
 ['LIEUTENANT GOVERNOR', None, None],
 ['VOTER NOMINATED', None, None],
 ['GAVIN NEWSOM', 'DEM', '64'],
 ['RON NEHRING', 'REP', '247'],
 ['SECRETARY OF STATE', None, None],
 ['VOTER NOMINATED', None, None],
 ['PETE PETERSON', 'REP', '248'],
 ['ALEX PADILLA', 'DEM', '64'],
 ['CONTROLLER', None, None],
 ['VOTER NOMINATED', None, None],
 ['BETTY T YEE', 'DEM', '59'],
 ['ASHLEY SWEARENGIN', 'REP', '249'],
 ['TREASURER', None, None],
 ['VOTER NOMINATED', None, None],
 ['GREG CONLON', 'REP', '240'],
 ['JOHN CHIANG', 'DEM', '69'],
 ['ATTORNEY GENERAL', None, None],
 ['VOTER NOMINATED', None, None],
 ['RONALD GOLD', 'REP', '245'],
 ['KAMALA D HARRIS', 'DEM', '68'],
 ['INSURANCE COMMISSIONER', None, None],
 ['VOTER NOMINATED', None, None],
 ['TED GAINES', 'REP', '249'],
 ['DAVE JONES', 'DEM', '60'],
 ['STATE BD EQUALIZATION D1', None, None],
 ['VOTER NOMINATED', None,

Let's restructure that slightly, so that each row contains information about the relevant race:

In [14]:
def get_results_table(page):
    """Build a tidy per-candidate results table for one page.

    Rows from ``get_results_rows`` are walked in order. A row with no vote
    cell is treated as a race heading and remembered; each row that has a
    vote cell is emitted as a result tagged with the most recent heading.
    "VOTER NOMINATED" rows are skipped.

    Args:
        page: A pdfplumber page, passed straight through to ``get_results_rows``.

    Returns:
        pandas.DataFrame: Columns ``race``, ``name``, ``party``, ``votes``
        (votes as integers). Empty, with the same columns, when no rows were
        extracted.

    Raises:
        ValueError: If a vote cell is not an integer once any thousands
            separators are removed.
    """
    # Guard against the extractor handing back None instead of a list of rows.
    rows = get_results_rows(page) or []
    results = []
    race = None
    for name, affil, votes in rows:
        if name == "VOTER NOMINATED":
            # Sub-heading under each race title; carries no data.
            continue
        if votes is None:
            # No vote cell means this row is a race heading.
            race = name
        else:
            # Tolerate thousands separators such as "1,247".
            vote_count = int(str(votes).replace(",", ""))
            results.append((race, name, affil, vote_count))
    return pd.DataFrame(results, columns=["race", "name", "party", "votes"])

In [15]:
# Preview the restructured results for the first page.
get_results_table(first_page)

Unnamed: 0,race,name,party,votes
0,GOVERNOR,NEEL KASHKARI,REP,247
1,GOVERNOR,EDMUND G BROWN,DEM,69
2,LIEUTENANT GOVERNOR,GAVIN NEWSOM,DEM,64
3,LIEUTENANT GOVERNOR,RON NEHRING,REP,247
4,SECRETARY OF STATE,PETE PETERSON,REP,248
5,SECRETARY OF STATE,ALEX PADILLA,DEM,64
6,CONTROLLER,BETTY T YEE,DEM,59
7,CONTROLLER,ASHLEY SWEARENGIN,REP,249
8,TREASURER,GREG CONLON,REP,240
9,TREASURER,JOHN CHIANG,DEM,69


From there, we can start to do some calculations:

In [16]:
def get_jerry_brown_pct(page):
    """Return Brown's percentage of the combined Brown + Kashkari vote on a page.

    Args:
        page: A pdfplumber page, passed straight through to ``get_results_table``.

    Returns:
        Brown's votes as a percentage of the votes for "EDMUND G BROWN" plus
        "NEEL KASHKARI", rounded to one decimal place.

    Raises:
        IndexError: If either candidate has no row in the page's results.
    """
    results = get_results_table(page)

    def votes_for(candidate):
        # First matching row wins; .iloc[0] raises IndexError when there is none.
        return results.loc[results["name"] == candidate, "votes"].iloc[0]

    brown_votes = votes_for("EDMUND G BROWN")
    kashkari_votes = votes_for("NEEL KASHKARI")
    two_way_total = kashkari_votes + brown_votes
    brown_share = float(brown_votes) / two_way_total
    return (100 * brown_share).round(1)

In [17]:
# Report Brown's share of the Brown + Kashkari vote for each precinct page.
for page in pdf.pages:
    precinct_id = get_precinct_id(page)
    brown = get_jerry_brown_pct(page)
    print("{0}: {1}%".format(precinct_id, brown))

PRECINCT|0050003A|ACTON: 21.8%
PRECINCT|0050004B|ACTON: 16.8%
PRECINCT|0050005A|ACTON: 21.3%
PRECINCT|0050014A|ACTON: 21.8%
PRECINCT|0050059A|ACTON: 22.6%
PRECINCT|0050061A|ACTON: 23.1%
PRECINCT|0070001A|AGOURA: 50.7%
PRECINCT|0070004A|AGOURA: 40.0%
PRECINCT|0070004C|AGOURA: 61.1%
PRECINCT|0070006A|AGOURA: 60.9%


---

---

---