# Notebook to add co-ordinates for 1999 Polling Places

The AEC started putting co-ordinates on polling place files from 2007.

The code below matches to 2007 polling places, where the name of the venue is the same.

While not perfect, this should result the vast majority of the time in a co-ordinate closely representing the location of the polling location in 1999

### Libraries

In [200]:
import pandas as pd
import numpy as np

### Helper functions

In [201]:
def left_of_bracket(s):
    if '(' in s:
        needle = s.find('(')
        r = s[:needle-1].strip()
        return r
    else:
        return s

### Import polling places

`import_polling_places(filepath)`
* Takes a path to a polling place file
* Returns a tidy data frame
    * Renames some columns
    * Dedups on `['state','polling_place']`

In [202]:
def import_polling_places(filepath):
    
    # read csv
    df_pp = pd.read_csv(
        filepath
    )
    
    # pick the columns I want to keep
    cols = [
        'State',
        'PollingPlaceNm',
        'PremisesNm',
        'PremisesAddress1',
        'PremisesAddress2',
        'PremisesAddress3',
        'PremisesSuburb',
        'PremisesStateAb',
        'PremisesPostCode',
        'Latitude',
        'Longitude'
    ]
    
    # filter for those
    df_pp = df_pp[cols]

    # create a polling place column missing the bracket
    lambda_polling_places = lambda x: left_of_bracket(x)
    df_pp['polling_place'] = df_pp['PollingPlaceNm'].apply(lambda_polling_places)

    # rename columns to make joining easier
    df_pp['premises'] = df_pp['PremisesNm']
    df_pp['postcode'] = df_pp['PremisesPostCode']

    # replace in the col headers list where I've modified/added the column
    cols = [c.replace('PollingPlaceNm', 'polling_place') for c in cols]
    cols = [c.replace('PremisesNm', 'premises') for c in cols]
    cols = [c.replace('PremisesPostCode', 'postcode') for c in cols]
    
    # reorder df
    df_pp = df_pp[cols]

    # dedup
    df_pp = df_pp.drop_duplicates()

    # make all headers lower case
    df_pp.columns = [x.lower() for x in df_pp.columns]
    
    return df_pp

In [203]:
# test above
filepath = 'federal_election_polling_places/pp_2007_election.csv'
df_pp = import_polling_places(filepath)
df_pp.head(3)

Unnamed: 0,state,polling_place,premises,premisesaddress1,premisesaddress2,premisesaddress3,premisessuburb,premisesstateab,postcode,latitude,longitude
0,NSW,Beverly Hills North,Beverly Hills North Public School,cnr Shorter Ave & King Georges Rd,,,BEVERLY HILLS,NSW,2209.0,-33.9413,151.075
1,NSW,East Hills,1st East Hills Scout Hall,cnr Henry Lawson Dr & MacLaurin Ave,,,EAST HILLS,NSW,2213.0,-33.9637,150.987
2,NSW,Hannans Road,Hannans Road Public School,Hannans Rd,,,RIVERWOOD,NSW,2210.0,-33.9459,151.058


### Import 1999 polling places

In [204]:
filepath = '1999_referenda_output/polling_places.csv'

df_pp_1999 = pd.read_csv(
    filepath
)

# add blank columns for match types and lat/lng
df_pp_1999['match_source'] = np.nan
df_pp_1999['match_type'] = np.nan
df_pp_1999['latitude'] = np.nan
df_pp_1999['longitude'] = np.nan

df_pp_1999 = df_pp_1999.set_index(['state','polling_place'])

df_pp_1999.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,premises,address,suburb,postcode,wheelchair_access,match_source,match_type,latitude,longitude
state,polling_place,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ACT,Bonython,Bonython Primary School,Hurtle Ave,BONYTHON,2905.0,F,,,,
ACT,Calwell,Calwell High School,Casey Cres,CALWELL,2905.0,F,,,,
ACT,Canberra Hospital,The Canberra Hospital,Blding 2 Level 3 Yamba Dr,GARRAN,2605.0,F,,,,


# Matches

#### Pandas setting I need for below to behave
pandas generates warnings for working with a data frame that's a copy of another
it thinks I might think I'm changing df_pp_1999 when im working with df_pp_1999_working
i'm turning this warning off because i'm doing this on purpose, so I can keep df_pp_1999 as a 'yet to be matched' file, and update it with each subsequent working file

In [205]:
pd.set_option('chained_assignment',None)

#### Functions

In [206]:
def match_polling_places(df_pp_1999, df_pp, settings):
    
    # split up our meta field
    keys = settings['keys']
    match_source = settings['match_source']
    match_type = settings['match_type']

    # filter for those columns
 
    df_pp_1999_working = df_pp_1999.reset_index()[[
        'state',
        'polling_place',
        'premises',
        'address',
        'suburb',
        'postcode',
        'wheelchair_access'
    ]]

    # the keys I want to keep from the second df in the join are the group_by keys, and also lat/lng
    cols_df_pp = keys + ['latitude','longitude']
    
    # add cols for match type
    df_pp_1999_working['match_source'] = match_source
    df_pp_1999_working['match_type'] = match_type
    
    # run the join
    df_pp_1999_working = pd.merge(
        df_pp_1999_working,
        df_pp[cols_df_pp],
        on=keys,
        how='left'
    )
    
    # delete those which we didn't match
    df_pp_1999_working = df_pp_1999_working[~df_pp_1999_working['latitude'].isnull()]
    
    return df_pp_1999_working

def match_unmatched_polling_places(df_pp_1999, settings):
    
    # get polling place file from settings
    filepath = settings['pp_filepath']    
    df_pp = import_polling_places(filepath)
    
    # work out which rows we haven't yet matched
    df_pp_1999_unmatched = df_pp_1999[df_pp_1999.match_source.isnull()]

    # run match for those
    df_pp_1999_matches = match_polling_places(df_pp_1999_unmatched, df_pp, settings)
    
    # update df_pp_1999 to include new matches
#     df_pp_1999.update(df_pp_1999_matches)
    
    # return
    return df_pp_1999_matches

def match_status(df_pp_1999):
    match_status = df_pp_1999.groupby('match_type').count()
    match_status = match_status['state']
    unmatched = len(df_pp_1999[df_pp_1999['match_type'].isnull()].index)
    
    print(match_status)
    print('unmatched: ' + str(unmatched))

### Match attempts

#### Match 1 - 2007 on premises name, state, and postcode
* Other than schools that have moved, these places should be the same
    * And for schools that have moved, the postcode test should ensure it's not too far

In [207]:
settings = dict(
    pp_filepath = 'federal_election_polling_places/pp_2007_election.csv',
    keys = ['state','premises','postcode'],
    match_source = '2007 Polling Places',
    match_type = 'Match 01 - state, premises, postcode'
)

# run the match
df_pp_1999_matches = match_unmatched_polling_places(df_pp_1999, settings)




In [208]:
# match_status(df_pp_1999)

print(str(len(df_pp_1999_matches.index)))
# remove any non-unique combinations of state and polling place
# df_pp_1999_matches.drop_duplicates(subset=[ ])
print(str(len(df_pp_1999_matches.index)))
# put a key on state and polling place
# df_pp_1999_matches = df_pp_1999_matches.set_index(['state', 'polling_place'])

pd.concat(g for _, g in df_pp_1999_matches.groupby(['state','polling_place']) if len(g) > 1)

# df_pp_1999.update(df_pp_1999_matches)

# df_pp_1999

4891
4891


Unnamed: 0,state,polling_place,premises,address,suburb,postcode,wheelchair_access,match_source,match_type,latitude,longitude
448,NSW,Blacktown West,Blacktown West Public School,Kildare Rd,BLACKTOWN,2148.0,N,2007 Polling Places,"Match 01 - state, premises, postcode",-33.7705,150.893
975,NSW,Blacktown West,Blacktown West Public School,Lancaster St,BLACKTOWN,2148.0,N,2007 Polling Places,"Match 01 - state, premises, postcode",-33.7705,150.893
102,NSW,Punchbowl,Punchbowl Public School,1333 Canterbury Rd,PUNCHBOWL,2196.0,N,2007 Polling Places,"Match 01 - state, premises, postcode",-33.9317,151.057
103,NSW,Punchbowl,Punchbowl Public School,1333 Canterbury Rd,PUNCHBOWL,2196.0,N,2007 Polling Places,"Match 01 - state, premises, postcode",-33.9322,151.056
252,NSW,Punchbowl,Punchbowl Public School,1333 Canterbury Rd,PUNCHBOWL,2196.0,A,2007 Polling Places,"Match 01 - state, premises, postcode",-33.9317,151.057
253,NSW,Punchbowl,Punchbowl Public School,1333 Canterbury Rd,PUNCHBOWL,2196.0,A,2007 Polling Places,"Match 01 - state, premises, postcode",-33.9322,151.056
736,NSW,Sutton,Sutton Public School,Victoria St,SUTTON,2620.0,A,2007 Polling Places,"Match 01 - state, premises, postcode",-35.1648,149.254
1197,NSW,Sutton,Sutton Public School,Victoria St,SUTTON,2620.0,N,2007 Polling Places,"Match 01 - state, premises, postcode",-35.1648,149.254
475,NSW,Walters Road,Walters Road Public School,Walters Rd,BLACKTOWN,2148.0,N,2007 Polling Places,"Match 01 - state, premises, postcode",-33.7841,150.891
1007,NSW,Walters Road,Walters Road Public School,158 Walters Rd,BLACKTOWN,2148.0,N,2007 Polling Places,"Match 01 - state, premises, postcode",-33.7841,150.891


#### Match 2 through 4 - 2010 through 2016 on premises name, state, and postcode
* Other than schools that have moved, these places should be the same
    * And for schools that have moved, the postcode test should ensure it's not too far

In [138]:
# Run merge on 2010 polling places, with premises name
settings = dict(
    pp_filepath = 'federal_election_polling_places/pp_2010_election.csv',
    keys = ['state','premises','postcode'],
    match_source = '2010 Polling Places',
    match_type = 'Match 02 - state, premises, postcode'
)

df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)

settings = dict(
    pp_filepath = 'federal_election_polling_places/pp_2013_election.csv',
    keys = ['state','premises','postcode'],
    match_source = '2013 Polling Places',
    match_type = 'Match 03 - state, premises, postcode'
)

df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)

settings = dict(
    pp_filepath = 'federal_election_polling_places/pp_2016_election.csv',
    keys = ['state','premises','postcode'],
    match_source = '2016 Polling Places',
    match_type = 'Match 04 - state, premises, postcode'
)

df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)

KeyError: "['state'] not in index"

In [111]:
match_status(df_pp_1999)

match_type
Match 01 - state, premises, postcode    3686
Match 02 - state, premises, postcode     477
Match 03 - state, premises, postcode     512
Match 04 - state, premises, postcode     861
Name: state, dtype: int64
unmatched: 1514


In [113]:
df_pp_1999.to_csv(
    '1999_referenda_output/polling_places_geocoded.csv',
    sep = ','
)