# Notebook to add co-ordinates for 1999 Polling Places

The AEC started putting co-ordinates on polling place files from 2007.

The code below matches to 2007 polling places, where the name of the venue is the same.

While not perfect, this should result the vast majority of the time in a co-ordinate closely representing the location of the polling location in 1999

### Libraries

In [254]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML

### Helper functions

In [255]:
def left_of_bracket(s):
    if '(' in s:
        needle = s.find('(')
        r = s[:needle-1].strip()
        return r
    else:
        return s

# i keep forgetting the syntax for this so writing a wrapper
def dedup_df(df, keys, keep = False):
    
    # for a data frame, drop anything thats a duplicate
    # if you change keep to first, it'll keep the first row rather than none
    df_dedup = df.drop_duplicates(keys, keep)
    return df_dedup

### Import polling places

`import_polling_places(filepath)`
* Takes a path to a polling place file
* Returns a tidy data frame
    * Renames some columns
    * Dedups on `['state','polling_place']`

In [256]:
def import_polling_places(filepath):
    
    # read csv
    df_pp = pd.read_csv(
        filepath
    )
    
    # pick the columns I want to keep
    cols = [
        'State',
        'PollingPlaceNm',
        'PremisesNm',
        'PremisesAddress1',
        'PremisesAddress2',
        'PremisesAddress3',
        'PremisesSuburb',
        'PremisesStateAb',
        'PremisesPostCode',
        'Latitude',
        'Longitude'
    ]
    
    # filter for those
    df_pp = df_pp[cols]

    # create a polling place column missing the bracket
    lambda_polling_places = lambda x: left_of_bracket(x)
    df_pp['polling_place'] = df_pp['PollingPlaceNm'].apply(lambda_polling_places)

    # rename columns to make joining easier
    df_pp['premises'] = df_pp['PremisesNm']
    df_pp['postcode'] = df_pp['PremisesPostCode']

    # replace in the col headers list where I've modified/added the column
    cols = [c.replace('PollingPlaceNm', 'polling_place') for c in cols]
    cols = [c.replace('PremisesNm', 'premises') for c in cols]
    cols = [c.replace('PremisesPostCode', 'postcode') for c in cols]
    
    # reorder df
    df_pp = df_pp[cols]

    # dedup
    df_pp = df_pp.drop_duplicates()

    # make all headers lower case
    df_pp.columns = [x.lower() for x in df_pp.columns]
    
    return df_pp

In [257]:
filepath = 'federal_election_polling_places/pp_2007_election.csv'
test = import_polling_places(filepath)
display('Rows: ' + str(len(test.index)))

'Rows: 7861'

### Import 1999 polling places

In [258]:
def import_1999_pp(filepath):

    df_pp_1999 = pd.read_csv(
        filepath
    )

    # add blank columns for match types and lat/lng
    df_pp_1999['match_source'] = np.nan
    df_pp_1999['match_type'] = np.nan
    df_pp_1999['latitude'] = np.nan
    df_pp_1999['longitude'] = np.nan

    # tell it to index on state and polling place
    df_pp_1999 = df_pp_1999.set_index(['state','polling_place'])
    
    return df_pp_1999

In [259]:
filepath = '1999_referenda_output/polling_places.csv'
df_pp_1999 = import_1999_pp(filepath)
display(df_pp_1999.head(3))

Unnamed: 0_level_0,Unnamed: 1_level_0,premises,address,suburb,postcode,wheelchair_access,match_source,match_type,latitude,longitude
state,polling_place,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ACT,Bonython,Bonython Primary School,Hurtle Ave,BONYTHON,2905.0,F,,,,
ACT,Calwell,Calwell High School,Casey Cres,CALWELL,2905.0,F,,,,
ACT,Canberra Hospital,The Canberra Hospital,Blding 2 Level 3 Yamba Dr,GARRAN,2605.0,F,,,,


# Matches

#### Pandas setting I need for below to behave
pandas generates warnings for working with a data frame that's a copy of another
it thinks I might think I'm changing df_pp_1999 when im working with df_pp_1999_working
i'm turning this warning off because i'm doing this on purpose, so I can keep df_pp_1999 as a 'yet to be matched' file, and update it with each subsequent working file

In [260]:
pd.set_option('chained_assignment',None)

### Functions

#### `match_polling_places(df_pp_1999, df_pp, settings)`
* For the 1999 data frame, and a given other polling place data frame, and a set of settings, run a merge, and return the rows that matched based on the join you specified
<br />
E.g:
```
match_polling_places(
    df_pp_1999,
    df_pp,
    dict(
        keys = ['state','premises','postcode'],
        match_source = '2007 Polling Places',
        match_type = 'Match 01 - state, premises, postcode'   
    )
)
```
* runs a join on state, premises, and postcode between df1 and df2
* keeps a defined set of columns from df1
* adds the columns match_source and match_type, and sets their value
* replaces the latitude and longitude columns of df1 with those from df2
* returns this data frame, deleting all rows that didn't match from df1

In [295]:
def match_polling_places(df1, df2, settings):
    
    # split up our meta field
    keys = settings['keys']
    match_source = settings['match_source']
    match_type = settings['match_type']

    # filter for those columns
 
    df_working = df1.reset_index()[[
        'state',
        'polling_place',
        'premises',
        'address',
        'suburb',
        'postcode',
        'wheelchair_access'
    ]]

    # the keys I want to keep from the second df in the join are the group_by keys, and also lat/lng
    cols_df2 = keys + ['latitude','longitude']
    
    # add cols for match type
    df_working['match_source'] = match_source
    df_working['match_type'] = match_type
    
    # run the join
    df_working = pd.merge(
        df_working,
        df2[cols_df2],
        on=keys,
        how='left'
    )
    
    # delete those which we didn't match
    df_working = df_working[~df_working['latitude'].isnull()]
    
    # dedup on the keys we matched on
    df_working = dedup_df(df_working, keys)
    
    return df_working

In [283]:
# test match_polling_places
filepath = '1999_referenda_output/polling_places.csv'
df1 = import_1999_pp(filepath)

filepath2 = 'federal_election_polling_places/pp_2007_election.csv'
df2 = import_polling_places(filepath2)

test = match_polling_places(
  df1,
  df2,
  dict(
      keys = ['state','premises','postcode'],
      match_source = '2007 Polling Places',
      match_type = 'Match 01 - state, premises, postcode'   
  )
)

display(test.head(3))

Unnamed: 0,state,polling_place,premises,address,suburb,postcode,wheelchair_access,match_source,match_type,latitude,longitude
0,ACT,Bonython,Bonython Primary School,Hurtle Ave,BONYTHON,2905.0,F,2007 Polling Places,"Match 01 - state, premises, postcode",-35.4318,149.083
1,ACT,Calwell,Calwell High School,Casey Cres,CALWELL,2905.0,F,2007 Polling Places,"Match 01 - state, premises, postcode",-35.4406,149.116
2,ACT,Canberra Hospital,The Canberra Hospital,Blding 2 Level 3 Yamba Dr,GARRAN,2605.0,F,2007 Polling Places,"Match 01 - state, premises, postcode",-35.3453,149.1


#### `match_unmatched_polling_places(df1, settings)`
* This is a wrapper function for `match_polling_places`
* It will only pass data that is NOT yet matched in df1 to the match function, so that we keep track of at what point in our order we matched the data frame (rather than overriding each time it matches)
* This will matter as we do less high quality matches at the bottom of the pile

In [293]:
def match_unmatched_polling_places(df1, settings):
    
    # get polling place file from settings
    filepath = settings['pp_filepath']    
    df2 = import_polling_places(filepath)

    # work out which rows we haven't yet matched
    df1_unmatched = df1[df1.match_source.isnull()]
    
    # run match for those
    df1_matches = match_polling_places(df1_unmatched, df2, settings)
    
    # dedup this file for combinations of state/polling_place (my unique key)
    keys = ['state','polling_place']
    df1_matches = dedup_df(df1_matches, keys)
    
    # check that worked by making it a key now
    df1_matches = df1_matches.set_index(keys)
    
    # update with matches
    df1.update(df1_matches)
    
    # return
    return df1

#### `match_status(df1)`
* a function to tell me for a given data frame what the match status is

In [335]:
def match_status(df1):
    
    # how many Nans are in match_type?
    not_matched = len(df1[df1['match_type'].isnull()].index)
    
    # make a df for none
    none = pd.DataFrame(dict(
                match_source = 'Not yet matched',
                count = not_matched
    ), index=[0])
    
    if not_matched == len(df1.index): # if all values are not-matched
        return none
    else:
        df = pd.DataFrame(
            df1.groupby('match_source')['match_source'].count().reset_index(name='count')
        )
        
        # add the non-matched row
        df = df.append(none)

        return df

### Match attempts

#### Match 1 - 2007 on premises name, state, and postcode
* Other than schools that have moved, these places should be the same
* And for schools that have moved, the postcode test should ensure it's not too far

In [337]:
# first match attempt - set up file
filepath = '1999_referenda_output/polling_places.csv'
df_pp_1999 = import_1999_pp(filepath)

# double check none are somehow magically matched yet
display(match_status(df_pp_1999))

# configure match settings
settings = dict(
    pp_filepath = 'federal_election_polling_places/pp_2007_election.csv',
    keys = ['state','premises','postcode'],
    match_source = '2007 Polling Places',
    match_type = 'Match 01 - state, premises, postcode'
)

# run match
df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)

# check results
display(match_status(df_pp_1999))
# dedup_df(df, keys, keep = False

Unnamed: 0,count,match_source
0,7044,Not yet matched


Unnamed: 0,count,match_source
0,4883,2007 Polling Places
0,2161,Not yet matched


In [340]:

display(match_status(df_pp_1999))

# configure match settings
settings = dict(
    pp_filepath = 'federal_election_polling_places/pp_2007_election.csv',
    keys = ['state','premises','postcode'],
    match_source = '2007 Polling Places',
    match_type = 'Match 01 - state, premises, postcode'
)

# run match
df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)

# check results
display(match_status(df_pp_1999))
# dedup_df(df, keys, keep = False

Unnamed: 0,count,match_source
0,4883,2007 Polling Places
0,2161,Not yet matched


Unnamed: 0,count,match_source
0,4883,2007 Polling Places
0,2161,Not yet matched


In [None]:
# match_status(df_pp_1999)

print(str(len(df_pp_1999_matches.index)))
# remove any non-unique combinations of state and polling place
# df_pp_1999_matches.drop_duplicates(subset=[ ])
print(str(len(df_pp_1999_matches.index)))
# put a key on state and polling place
# df_pp_1999_matches = df_pp_1999_matches.set_index(['state', 'polling_place'])

pd.concat(g for _, g in df_pp_1999_matches.groupby(['state','polling_place']) if len(g) > 1)

# df_pp_1999.update(df_pp_1999_matches)

# df_pp_1999

#### Match 2 through 4 - 2010 through 2016 on premises name, state, and postcode
* Other than schools that have moved, these places should be the same
    * And for schools that have moved, the postcode test should ensure it's not too far

In [None]:
# Run merge on 2010 polling places, with premises name
settings = dict(
    pp_filepath = 'federal_election_polling_places/pp_2010_election.csv',
    keys = ['state','premises','postcode'],
    match_source = '2010 Polling Places',
    match_type = 'Match 02 - state, premises, postcode'
)

df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)

settings = dict(
    pp_filepath = 'federal_election_polling_places/pp_2013_election.csv',
    keys = ['state','premises','postcode'],
    match_source = '2013 Polling Places',
    match_type = 'Match 03 - state, premises, postcode'
)

df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)

settings = dict(
    pp_filepath = 'federal_election_polling_places/pp_2016_election.csv',
    keys = ['state','premises','postcode'],
    match_source = '2016 Polling Places',
    match_type = 'Match 04 - state, premises, postcode'
)

df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)

In [None]:
match_status(df_pp_1999)

In [None]:
df_pp_1999.to_csv(
    '1999_referenda_output/polling_places_geocoded.csv',
    sep = ','
)