Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
jacksonj04 committed Jul 12, 2018
0 parents commit e311ad3
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
9 changes: 9 additions & 0 deletions requirements.txt
@@ -0,0 +1,9 @@
# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/python

# Custom version of scraperwiki library
-e git+http://github.com/openaustralia/scraperwiki-python.git@morph_defaults#egg=scraperwiki

lxml==3.4.4
cssselect==0.9.1
1 change: 1 addition & 0 deletions runtime.txt
@@ -0,0 +1 @@
python-2.7.9
138 changes: 138 additions & 0 deletions scraper.py
@@ -0,0 +1,138 @@
# coding=utf-8

import scraperwiki
import lxml.html
import sqlite3
import re


def cleanup(string):
    """Tidy a scraped text fragment.

    Strips surrounding whitespace and straightens any curly
    (typographic) apostrophes, returning the cleaned string.
    """
    return string.strip().replace(u'’', '\'')


# Wikidata item IDs for each party name exactly as it appears on the
# council website; party names missing from this map are collected in
# unreconciledParties and reported at the end of the run.
PARTY_MAP = {
    'Conservative': 'Q9626',
    'Garforth & Swillington Independents': 'Q55465979',
    'Green Party': 'Q9669',
    'Labour': 'Q9630',
    'Liberal Democrats': 'Q9624',
    'Morley Borough Independents': 'Q55465915',
}

# Wikidata item IDs for each Leeds ward name exactly as it appears on
# the council website; ward names missing from this map are collected
# in unreconciledWards and reported at the end of the run.
WARD_MAP = {
    'Adel and Wharfedale': 'Q55466756',
    'Alwoodley': 'Q2131866',
    'Ardsley and Robin Hood': 'Q55466797',
    'Armley': 'Q55466825',
    'Beeston and Holbeck': 'Q55466755',
    'Bramley and Stanningley': 'Q55466807',
    'Burmantofts and Richmond Hill': 'Q55466805',
    'Calverley and Farsley': 'Q55466776',
    'Chapel Allerton': 'Q55466785',
    'Cross Gates and Whinmoor': 'Q55466803',
    'Farnley and Wortley': 'Q55466762',
    'Garforth and Swillington': 'Q55466783',
    'Gipton and Harehills': 'Q55466757',
    'Guiseley and Rawdon': 'Q55466823',
    'Harewood': 'Q55466789',
    'Headingley and Hyde Park': 'Q55466799',
    'Horsforth': 'Q55466779',
    'Hunslet and Riverside': 'Q55466821',
    'Killingbeck and Seacroft': 'Q55466786',
    'Kippax and Methley': 'Q55466808',
    'Kirkstall': 'Q55466761',
    'Little London and Woodhouse': 'Q55466753',
    'Middleton Park': 'Q6842031',
    'Moortown': 'Q55466778',
    'Morley North': 'Q55466800',
    'Morley South': 'Q55466782',
    'Otley and Yeadon': 'Q55466766',
    'Pudsey': 'Q55466813',
    'Rothwell': 'Q55466765',
    'Roundhay': 'Q2735485',
    'Temple Newsam': 'Q55466781',
    'Weetwood': 'Q55466759',
    'Wetherby': 'Q55466810',
}

# Council index page listing every current member in one table.
BASE_URL = 'https://democracy.leeds.gov.uk/mgMemberIndex.aspx?VW=TABLE&PIC=1&FN='

# Accumulators filled by the scraping loop below.
parsedMembers = []        # one dict of member data per table row
unreconciledWards = []    # ward names with no WARD_MAP entry
unreconciledParties = []  # party names with no PARTY_MAP entry

# Fetch the member index page and walk its table, building one dict per
# councillor in parsedMembers. Party and ward names are reconciled to
# Wikidata IDs via PARTY_MAP / WARD_MAP; unmatched names are recorded
# for the summary report below.
print('(i) Scraping from ' + BASE_URL)

# Get the page!
html = scraperwiki.scrape(BASE_URL)
ssRoot = lxml.html.fromstring(html)

rows = ssRoot.cssselect('#mgTable1 tr')

# Compile the patterns once, outside the loop; raw strings keep the
# backslash escapes literal.
NAME_PATTERN = re.compile(r'(.+?) (.+)')  # "<honorific> <rest of name>"
ID_PATTERN = re.compile(r'mgUserInfo\.aspx\?UID=([0-9]+)')

# Skip the header row
for row in rows[1:]:

    memberData = {}

    # Parenthesized single-argument form works identically on
    # Python 2 and 3, matching the print(...) calls above.
    print(row)

    nameLink = row.cssselect('a')[0]

    nameUnparsed = nameLink.text.strip()

    # Split e.g. "Councillor Jane Doe" into honorific + name.
    nameRegex = NAME_PATTERN.search(nameUnparsed)
    memberData['honorific_string'] = nameRegex.group(1)

    memberData['name'] = cleanup(nameRegex.group(2))

    linkHref = nameLink.attrib['href']

    # The member ID is the UID query parameter on the profile link.
    idRegex = ID_PATTERN.search(linkHref)
    memberData['id'] = idRegex.group(1)

    memberData['url'] = cleanup('https://democracy.leeds.gov.uk/mgUserInfo.aspx?UID=' + memberData['id'])

    partyName = row.cssselect('td')[2].text
    memberData['party'] = partyName

    if partyName in PARTY_MAP:
        memberData['party_id'] = PARTY_MAP[partyName]
    else:
        unreconciledParties.append(partyName)

    wardName = row.cssselect('td')[3].text
    memberData['ward'] = wardName

    if wardName in WARD_MAP:
        memberData['ward_id'] = WARD_MAP[wardName]
    else:
        unreconciledWards.append(wardName)

    print(memberData)
    parsedMembers.append(memberData)


# Report run totals — including any party/ward names that could not be
# reconciled to Wikidata IDs — then persist the results.
# Parenthesized single-argument print works identically on Python 2
# and 3, matching the print(...) calls used earlier in the file.
print('(i) Done.')
print('(i) Counted {} Members in total'.format(len(parsedMembers)))
print('<!> {} unreconciled wards:'.format(len(unreconciledWards)))
print(unreconciledWards)
print('<!> {} unreconciled parties:'.format(len(unreconciledParties)))
print(unreconciledParties)


# Clear out any previous run's rows; the table does not exist on a
# first run, so a missing-table error is deliberately ignored.
try:
    scraperwiki.sqlite.execute('DELETE FROM data')
except sqlite3.OperationalError:
    pass
scraperwiki.sqlite.save(
    unique_keys=['id'],
    data=parsedMembers)

0 comments on commit e311ad3

Please sign in to comment.