# Notebook to add co-ordinates for 1999 Polling Places

The AEC started putting co-ordinates on polling place files from 2007.

The code below matches each 1999 polling place to a polling place from 2007 (and, for those still unmatched, 2010) where the state and the name of the premises are the same.

While not perfect, in the vast majority of cases this should produce a co-ordinate that closely represents where the polling place was located in 1999.

### Libraries

In [135]:
import pandas as pd
import numpy as np

### Helper functions

In [73]:
def left_of_bracket(s):
    """Return the text of ``s`` that precedes its first '(' character.

    Used to drop a bracketed qualifier from a polling place name, e.g.
    ``'Sydney (Grayndler)'`` becomes ``'Sydney'``.

    Args:
        s: The value to clean. Anything that is not a ``str`` (for example
            the float NaN pandas uses for a missing cell) is returned as is.

    Returns:
        The portion of ``s`` before the first '(' with surrounding whitespace
        stripped, or ``s`` unchanged when it contains no '('.
    """
    # missing values arrive from pandas as float NaN, which has no text to cut
    if not isinstance(s, str):
        return s

    # partition splits on the first '(' only; ``bracket`` is '' when none exists
    head, bracket, _ = s.partition('(')
    if not bracket:
        return s

    # strip rather than slicing one character short, so names written without
    # a space before the bracket ('Foo(bar)') keep their last letter
    return head.strip()

### Import polling places

`import_polling_places(filepath)`
* Takes a path to a polling place file
* Returns a tidy data frame
    * Renames some columns
    * Drops exact duplicate rows (dedup is across all retained columns, after the bracketed suffix has been removed from `polling_place`)

In [181]:
def import_polling_places(filepath):
    """Load a polling place CSV and return it as a tidy data frame.

    The file is read with ``pd.read_csv``, reduced to the columns listed
    below, and then:

    * ``PollingPlaceNm`` is cleaned with ``left_of_bracket`` and renamed to
      ``polling_place``
    * ``PremisesNm`` is renamed to ``premises``
    * exact duplicate rows are dropped
    * all column headers are lower-cased

    The resulting row count is printed as ``num rows: N``.

    Args:
        filepath: Path to the polling place CSV file.

    Returns:
        A data frame with the columns ``state, polling_place, premises,
        premisesaddress1, premisesaddress2, premisesaddress3, premisessuburb,
        premisesstateab, premisespostcode, latitude, longitude``.

    Raises:
        KeyError: If the file lacks any of the expected columns.
    """
    # the columns I want to keep, in output order
    source_cols = [
        'State',
        'PollingPlaceNm',
        'PremisesNm',
        'PremisesAddress1',
        'PremisesAddress2',
        'PremisesAddress3',
        'PremisesSuburb',
        'PremisesStateAb',
        'PremisesPostCode',
        'Latitude',
        'Longitude',
    ]

    # read csv and keep those columns; copy so the edits below act on an
    # independent frame rather than on a slice of the raw file
    df_pp = pd.read_csv(filepath)[source_cols].copy()

    # drop the bracketed suffix from the polling place name
    # na_action='ignore' leaves missing names as NaN instead of passing them
    # to the string helper
    df_pp['PollingPlaceNm'] = df_pp['PollingPlaceNm'].map(
        left_of_bracket, na_action='ignore'
    )

    # rename in place (keeps column order) to make joining easier
    df_pp = df_pp.rename(columns={
        'PollingPlaceNm': 'polling_place',
        'PremisesNm': 'premises',
    })

    # dedup - only rows identical in every retained column are removed
    df_pp = df_pp.drop_duplicates()

    # make all headers lower case
    df_pp.columns = [header.lower() for header in df_pp.columns]

    print('num rows: ' + str(len(df_pp.index)))

    return df_pp

In [188]:
# test - 2007
# Smoke-test the importer on the 2007 file and preview its first three rows.

filepath = 'federal_election_polling_places/pp_2007_election.csv'
df_pp = import_polling_places(filepath)
df_pp.head(3)

num rows: 7861


Unnamed: 0,state,polling_place,premises,premisesaddress1,premisesaddress2,premisesaddress3,premisessuburb,premisesstateab,premisespostcode,latitude,longitude
0,NSW,Beverly Hills North,Beverly Hills North Public School,cnr Shorter Ave & King Georges Rd,,,BEVERLY HILLS,NSW,2209.0,-33.9413,151.075
1,NSW,East Hills,1st East Hills Scout Hall,cnr Henry Lawson Dr & MacLaurin Ave,,,EAST HILLS,NSW,2213.0,-33.9637,150.987
2,NSW,Hannans Road,Hannans Road Public School,Hannans Rd,,,RIVERWOOD,NSW,2210.0,-33.9459,151.058


### Import 1999 polling places

In [204]:
filepath = '1999_referenda_output/polling_places.csv'

df_pp_1999 = pd.read_csv(filepath)

# add blank columns for match types and lat/lng
# match_source / match_type are later filled with strings, so create them as
# object columns; assigning a bare np.nan would make them float64 and force
# every later string update to upcast the column (newer pandas releases are
# understood to warn about this - confirm against the installed version)
df_pp_1999['match_source'] = pd.Series(np.nan, index=df_pp_1999.index, dtype=object)
df_pp_1999['match_type'] = pd.Series(np.nan, index=df_pp_1999.index, dtype=object)

# co-ordinates are numeric, so plain float NaN is the right placeholder
df_pp_1999['latitude'] = np.nan
df_pp_1999['longitude'] = np.nan

df_pp_1999.head(3)

Unnamed: 0,state,polling_place,premises,address,suburb,postcode,wheelchair_access,match_source,match_type,latitude,longitude
0,ACT,Bonython,Bonython Primary School,Hurtle Ave,BONYTHON,2905.0,F,,,,
1,ACT,Calwell,Calwell High School,Casey Cres,CALWELL,2905.0,F,,,,
2,ACT,Canberra Hospital,The Canberra Hospital,Blding 2 Level 3 Yamba Dr,GARRAN,2605.0,F,,,,


### Match on premises name

#### Pandas setting I need for below to behave
pandas generates a warning when you modify a data frame that is a copy (slice) of another,
because it thinks I might believe I'm changing df_pp_1999 when I'm actually working with df_pp_1999_working.
I'm turning this warning off because I'm doing this on purpose, so that I can keep df_pp_1999 as a 'yet to be matched' file and update it with each subsequent working file.

In [153]:
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.set_option('chained_assignment',None)

#### Functions

In [198]:
def match_polling_places(df_pp_1999, df_pp, settings):
    """Look up co-ordinates for 1999 polling places in a reference data frame.

    Rows of ``df_pp_1999`` are left-joined to ``df_pp`` on ``settings['keys']``
    and only the rows that found a latitude are returned. The returned frame
    keeps the index labels of ``df_pp_1999`` so that a caller can apply it
    with ``df_pp_1999.update(result)`` and have each match land on the row it
    belongs to, even when ``df_pp_1999`` is a filtered subset.

    Args:
        df_pp_1999: Data frame holding the columns ``state, polling_place,
            premises, address, suburb, postcode, wheelchair_access``. It is
            not modified.
        df_pp: Reference data frame containing the key columns plus
            ``latitude`` and ``longitude``.
        settings: Mapping with
            ``keys`` - list of column names to join on,
            ``match_source`` - label written to the ``match_source`` column,
            ``match_type`` - label written to the ``match_type`` column.

    Returns:
        A new data frame with the seven base columns followed by
        ``match_source, match_type, latitude, longitude``, containing only
        the matched rows and indexed like the corresponding input rows.
    """
    # split up our meta field
    keys = list(settings['keys'])
    match_source = settings['match_source']
    match_type = settings['match_type']

    # work on an independent copy restricted to the base columns
    df_pp_1999_working = df_pp_1999[[
        'state',
        'polling_place',
        'premises',
        'address',
        'suburb',
        'postcode',
        'wheelchair_access'
    ]].copy()

    # add cols for match type
    df_pp_1999_working['match_source'] = match_source
    df_pp_1999_working['match_type'] = match_type

    # the columns I want from the second df are the join keys and lat/lng
    cols_df_pp = keys + ['latitude', 'longitude']

    # build a one-row-per-key lookup:
    # * rows without a latitude can never produce a match, and rows with a
    #   missing key must go because merge pairs NaN keys with each other
    # * duplicate keys would multiply the 1999 rows in the join, so keep the
    #   first co-ordinate found for each key
    lookup = (
        df_pp[cols_df_pp]
        .dropna(subset=keys + ['latitude'])
        .drop_duplicates(subset=keys, keep='first')
    )

    # run the join on the configured keys
    original_index = df_pp_1999_working.index
    df_pp_1999_working = pd.merge(
        df_pp_1999_working,
        lookup,
        on=keys,
        how='left'
    )

    # merge ignores the left index and renumbers from 0; with unique lookup
    # keys a left join yields exactly one output row per input row in the
    # same order, so the original labels can be put straight back
    df_pp_1999_working.index = original_index

    # delete those which we didn't match
    return df_pp_1999_working[df_pp_1999_working['latitude'].notnull()]

def match_unmatched_polling_places(df_pp_1999, settings):
    """Run one matching pass over the rows of ``df_pp_1999`` not yet matched.

    Args:
        df_pp_1999: The 1999 polling places, including a ``match_source``
            column that is null for rows still awaiting a match. Updated in
            place.
        settings: Mapping with ``pp_filepath`` (the polling place file to
            match against) plus the entries read by ``match_polling_places``.

    Returns:
        The same ``df_pp_1999`` object, after the new matches have been
        applied to it.
    """
    # load the reference polling places named in the settings
    reference = import_polling_places(settings['pp_filepath'])

    # a null match_source marks a row no earlier pass has matched
    still_unmatched = df_pp_1999.loc[df_pp_1999['match_source'].isnull()]

    # look those rows up in the reference file
    new_matches = match_polling_places(still_unmatched, reference, settings)

    # fold the new matches back into the master frame
    df_pp_1999.update(new_matches)

    return df_pp_1999

In [205]:
# Run merge on 2007 polling places, with premises name
settings = dict(
    pp_filepath = 'federal_election_polling_places/pp_2007_election.csv',
    keys = ['state','premises'],
    match_source = '2007 Polling Places',
    match_type = 'Premises Name'
)

# actually run the pass: settings alone changes nothing, and pp_filepath is
# only read by match_unmatched_polling_places
df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)



In [206]:
# Preview the first ten rows of df_pp_1999, including the match columns.
df_pp_1999.head(10)

Unnamed: 0,state,polling_place,premises,address,suburb,postcode,wheelchair_access,match_source,match_type,latitude,longitude
0,ACT,Bonython,Bonython Primary School,Hurtle Ave,BONYTHON,2905.0,F,2007 Polling Places,Premises Name,-35.4318,149.083
1,ACT,Calwell,Calwell High School,Casey Cres,CALWELL,2905.0,F,2007 Polling Places,Premises Name,-35.4406,149.116
2,ACT,Canberra Hospital,The Canberra Hospital,Blding 2 Level 3 Yamba Dr,GARRAN,2605.0,F,2007 Polling Places,Premises Name,-35.3453,149.099
3,ACT,Chapman,Chapman Primary School,Perry Dr,CHAPMAN,2611.0,F,2007 Polling Places,Premises Name,-35.3564,149.042
4,ACT,Chifley,Melrose Primary School,Maclaurin Cres,CHIFLEY,2606.0,A,,,,
5,ACT,Chisholm,Chisholm Primary School,Hambidge Cres,CHISHOLM,2905.0,A,,,,
6,ACT,Conder,Charles Conder Primary School,Tom Roberts Ave,CONDER,2906.0,F,2007 Polling Places,Premises Name,-35.4649,149.098
7,ACT,Curtin,Curtin Primary School,Theodore St,CURTIN,2605.0,A,2007 Polling Places,Premises Name,-35.3317,149.08
8,ACT,Deakin,CCEGGS Senior School,Melbourne Ave,DEAKIN,2600.0,N,,,,
9,ACT,Duffy,Duffy Primary School,Burrinjuck Cres,DUFFY,2611.0,F,2007 Polling Places,Premises Name,-35.3347,149.032


In [208]:
# Run merge on 2010 polling places, with premises name

# load the 2010 reference file (the importer prints its row count)
filepath = 'federal_election_polling_places/pp_2010_election.csv'
df_pp = import_polling_places(filepath)
df_pp.head(3)

# labels and join keys for this pass
settings = {
    'keys': ['state', 'premises'],
    'match_source': '2010 Polling Places',
    'match_type': 'Premises Name',
}

# filter for just polling places we haven't yet matched
still_unmatched_mask = df_pp_1999['match_source'].isnull()
df_pp_1999_unmatched = df_pp_1999[still_unmatched_mask]

# match those rows against 2010, then write the hits back into the master frame
df_pp_1999_matches = match_polling_places(df_pp_1999_unmatched, df_pp, settings)
df_pp_1999.update(df_pp_1999_matches)

num rows: 8248


In [210]:
# Preview the first 200 rows of df_pp_1999, including the match columns.
df_pp_1999.head(200)

Unnamed: 0,state,polling_place,premises,address,suburb,postcode,wheelchair_access,match_source,match_type,latitude,longitude
0,ACT,Bonython,Bonython Primary School,Hurtle Ave,BONYTHON,2905.0,F,2007 Polling Places,Premises Name,-35.4318,149.083
1,ACT,Calwell,Calwell High School,Casey Cres,CALWELL,2905.0,F,2007 Polling Places,Premises Name,-35.4406,149.116
2,ACT,Canberra Hospital,The Canberra Hospital,Blding 2 Level 3 Yamba Dr,GARRAN,2605.0,F,2007 Polling Places,Premises Name,-35.3453,149.099
3,ACT,Chapman,Chapman Primary School,Perry Dr,CHAPMAN,2611.0,F,2007 Polling Places,Premises Name,-35.3564,149.042
4,ACT,Chifley,Melrose Primary School,Maclaurin Cres,CHIFLEY,2606.0,A,,,,
5,ACT,Chisholm,Chisholm Primary School,Hambidge Cres,CHISHOLM,2905.0,A,,,,
6,ACT,Conder,Charles Conder Primary School,Tom Roberts Ave,CONDER,2906.0,F,2007 Polling Places,Premises Name,-35.4649,149.098
7,ACT,Curtin,Curtin Primary School,Theodore St,CURTIN,2605.0,A,2007 Polling Places,Premises Name,-35.3317,149.080
8,ACT,Deakin,CCEGGS Senior School,Melbourne Ave,DEAKIN,2600.0,N,,,,
9,ACT,Duffy,Duffy Primary School,Burrinjuck Cres,DUFFY,2611.0,F,2007 Polling Places,Premises Name,-35.3347,149.032
