In [157]:
import sys
import urllib3
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim, OpenMapQuest

# HTTP connection pool used for the permit page requests in the scrape cell.
req = urllib3.PoolManager()
# Geocoder client that get_map_coordinates uses to look up site addresses.
locator = Nominatim(user_agent='myGeocoder')
# Suppress urllib3's InsecureRequestWarning messages for the whole session.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [253]:
def get_permit_url(permit_no, year=19):
    """Build the case-details URL for a planning application.

    Args:
        permit_no: Integer application number; zero-padded to five digits.
        year: Integer two-digit year suffix of the case reference. Defaults
            to 19, the value that used to be hard-coded, so existing callers
            get exactly the same URL as before.

    Returns:
        The URL string, whose query is ``CaseType=PA/<number>/<year>``.
    """
    return f'https://www.pa.org.mt/pacasedetails?CaseType=PA/{permit_no:05d}/{year:02d}'

def format_date(date):
    """Re-express a ``'%d %B %Y'`` date string (e.g. '5 March 2019') as ``'%Y-%m-%d'``.

    Raises:
        ValueError: if ``date`` does not match the ``'%d %B %Y'`` layout.
    """
    parsed = datetime.strptime(date, '%d %B %Y')
    return parsed.strftime('%Y-%m-%d')

# Convert date fields to %Y-%m-%d format
def update_date_fields(permit_details):
    """Normalise every date-like field of a permit to ``%Y-%m-%d``, in place.

    A field is treated as a date when its key contains the substring 'date'.
    Values that cannot be parsed by ``format_date`` are left as they are.

    Args:
        permit_details: Dict of field name -> value; it is mutated.

    Returns:
        The same ``permit_details`` dict, for call chaining.
    """
    # Snapshot the matching keys first so the dict is not iterated while it
    # is being written to.
    date_fields = [key for key in permit_details if 'date' in key]
    for field in date_fields:
        try:
            permit_details[field] = format_date(permit_details[field])
        except (ValueError, TypeError):
            # Value is not a '%d %B %Y' date (blank, placeholder text, or not
            # a string): keep it untouched. Only parsing errors are caught so
            # that KeyboardInterrupt and real bugs are not silently swallowed.
            continue
    return permit_details

def get_board_minutes_url(soup, permit_details):
    """Store the board-minutes link of a permit page under 'boardminutes'.

    The link is taken from the ``href`` of the parent element of the image
    whose alt text is 'Link to View Board Minutes'. When the page has no such
    image, or its parent carries no ``href``, ``permit_details`` is left
    unchanged instead of raising.

    Args:
        soup: Parsed permit page; must offer a BeautifulSoup-style ``find``.
        permit_details: Dict of permit fields; it is mutated.

    Returns:
        None.
    """
    icon = soup.find('img', {'alt': 'Link to View Board Minutes'})
    # find() yields None when nothing matches; guard every step of the lookup.
    link = icon.parent if icon is not None else None
    href = link.get('href') if link is not None else None
    if href is not None:
        permit_details['boardminutes'] = 'pa.org.mt/' + href

def get_map_coordinates(permit):
    """Split the site address into road/city and geocode it, in place.

    The address is read from ``permit['locationofdevelopment']`` and split on
    commas: the last part becomes 'city', the part before it 'road' (empty
    when the address has no comma). Both are whitespace-stripped. The pair is
    then looked up with the module-level ``locator``; 'lat' and 'long' are
    set to the result, or to None when the lookup fails or finds nothing.

    Args:
        permit: Dict of permit fields; it is mutated.

    Returns:
        The same ``permit`` dict with 'road', 'city', 'lat' and 'long' set.
    """
    # A missing or empty address must not abort the whole scrape.
    site_address = permit.get('locationofdevelopment') or ''
    parts = [part.strip() for part in site_address.split(',')]

    permit['road'] = parts[-2] if len(parts) > 1 else ''
    permit['city'] = parts[-1]
    permit['lat'] = None
    permit['long'] = None

    query = f"{permit['road']} {permit['city']}".strip()
    if not query:
        # Nothing to look up.
        return permit

    try:
        location = locator.geocode(query)
        if location is not None:
            permit['lat'] = location.latitude
            permit['long'] = location.longitude
    except Exception:
        # Geocoding is best-effort: any lookup failure leaves the coordinates
        # as None. Catching Exception (not a bare except) lets Ctrl-C still
        # interrupt a long scraping run.
        permit['lat'] = None
        permit['long'] = None
    return permit

def get_permit_details(soup):
    """Extract the fields of one permit page into a dict.

    Each ``td.fieldLabel`` cell is paired by position with the ``td.fieldData``
    cell at the same index. The label text, with colons and spaces removed and
    lower-cased, becomes the key; the stripped data text becomes the value.
    When at least one field was found, the 'sitenoticeimage' entry is dropped,
    the board-minutes link is resolved, the address is geocoded and the date
    fields are reformatted.

    Args:
        soup: Parsed permit page (BeautifulSoup-style ``find_all``).

    Returns:
        Dict of permit fields; empty when the page has no field cells.
    """
    label_cells = soup.find_all('td', {'class': 'fieldLabel'})
    value_cells = soup.find_all('td', {'class': 'fieldData'})

    permit = {}
    for idx, label_cell in enumerate(label_cells):
        field_value = value_cells[idx].text.strip()
        field_name = label_cell.text.replace(':', '').strip().replace(" ", "").lower()
        permit[field_name] = field_value

    # Nothing scraped: skip all post-processing.
    if not permit:
        return permit

    permit.pop('sitenoticeimage', None)

    if 'boardminutes' in permit:
        get_board_minutes_url(soup, permit)

    # Parse the address first, then convert the dates.
    return update_date_fields(get_map_coordinates(permit))

In [257]:
# Scrape consecutive permit pages, starting at permit_no, until the site
# reports that an application number does not exist.
permits = []
more_permits = True
permit_no = 257

while more_permits:
    # Fetch and parse in one place so the first page is checked for the
    # "does not exist" marker exactly like every later page.
    url = get_permit_url(permit_no)
    res = req.request('GET', url)
    soup = BeautifulSoup(res.data, 'html.parser')
    more_permits = 'This Application Number does not exist' not in soup.text
    if not more_permits:
        break

    # Print current permit number, overwriting the previous progress line
    sys.stdout.write("\r" + f'Scraping details of permit {permit_no:05d}/19')
    sys.stdout.flush()

    permit_details = get_permit_details(soup)
    # Store a copy so later changes to permit_details cannot alter the record.
    permits.append(dict(permit_details))

    permit_no += 1

Scraping details of permit 05080/19

In [240]:
# Snapshot the scraped permit dicts as a DataFrame (one row per permit).
# NOTE(review): this cell's execution count (240) is lower than the scrape
# cell's (257), so it was presumably run against an earlier scrape with a
# different starting permit_no; on a fresh Run All it would hold the same
# rows as above_256. Confirm how this frame should be rebuilt.
upto_256 = pd.DataFrame(permits)

In [258]:
# Snapshot the current contents of `permits` as a DataFrame (one row per
# permit). The scrape cell shown above starts at permit_no = 257.
# NOTE(review): reads the same `permits` variable as upto_256, so the result
# depends on which scrape run was executed last.
above_256 = pd.DataFrame(permits)

In [263]:
# Persist the first batch to disk, leaving out the DataFrame index column.
upto_256.to_csv('upto_256.csv', index=False)

In [264]:
# Inspect the column set before combining; the recorded output below lists
# a 'representation' column for this frame.
above_256.columns

Index(['appealreferencenumber', 'applicant', 'applicationtype', 'architect',
       'boardminutes', 'casecategory', 'casenumber', 'casestatus', 'city',
       'datepublishedinnewspapers', 'descriptionofworks', 'lat',
       'locationofdevelopment', 'long', 'receptiondate',
       'reconsiderationdecisionposteddate',
       'reconsiderationnonexecutabledecisionposteddate', 'representation',
       'representationexpirydate', 'road', 'targetdate', 'validationdate'],
      dtype='object')

In [265]:
# Inspect the column set before combining; the recorded output below has no
# 'representation' column for this frame.
upto_256.columns

Index(['appealreferencenumber', 'applicant', 'applicationtype', 'architect',
       'boardminutes', 'casecategory', 'casenumber', 'casestatus', 'city',
       'datepublishedinnewspapers', 'descriptionofworks', 'lat',
       'locationofdevelopment', 'long', 'receptiondate',
       'reconsiderationdecisionposteddate',
       'reconsiderationnonexecutabledecisionposteddate',
       'representationexpirydate', 'road', 'targetdate', 'validationdate'],
      dtype='object')

In [274]:
# Stack both batches and write the combined table without the index column.
# pd.concat replaces DataFrame.append, which was deprecated and then removed
# in pandas 2.0; columns present in only one frame are kept and filled with
# NaN for the other frame's rows, as append did.
pd.concat([upto_256, above_256]).to_csv('permits_2019.csv', index=False)