In [3]:
from bs4 import BeautifulSoup
import re
import spacy
import json
import requests

In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    }

urls_act = ['https://sso.agc.gov.sg/Browse/Act/Current/All?PageSize=500&SortBy=Title&SortOrder=ASC',
            'https://sso.agc.gov.sg/Browse/Act/Current/All/1?PageSize=500&SortBy=Title&SortOrder=ASC']

In [4]:
acts = dict()
for url in urls_act:
    # each page (500 results) in the browse section
    r = requests.get(url, headers=headers)
    assert r.status_code == requests.status_codes.codes.ok
    soup = BeautifulSoup(r.text)
    for a in soup('a'):
        if a.get('rel') is not None:
            acts[a.get_text()] = a['href']

In [5]:
acts

{'Accountants Act': '/Act/AA2004',
 'Accounting and Corporate Regulatory Authority Act': '/Act/ACRAA2004',
 'Accounting Standards Act': '/Act/ASA2007',
 'Active Mobility Act 2017': '/Act/AMA2017',
 'Administration of Justice (Protection) Act 2016': '/Act/AJPA2016',
 'Administration of Muslim Law Act': '/Act/AMLA1966',
 'Adoption of Children Act': '/Act/ACA1939',
 'Advance Medical Directive Act': '/Act/AMDA1996',
 'Agency for Science, Technology and Research Act': '/Act/ASTRA1990',
 'Air Navigation Act': '/Act/ANA1966',
 'Allied Health Professions Act': '/Act/AHPA2011',
 'Amusement Rides Safety Act': '/Act/ARSA2011',
 'Animals and Birds Act': '/Act/ABA1965',
 'Application of English Law Act': '/Act/AELA1993',
 'Apportionment Act': '/Act/AA1928',
 'Apportionment of Rents Act': '/Act/ARA1909',
 'Appraisers Act': '/Act/AA1906',
 'Arbitration (International Investment Disputes) Act': '/Act/AIIDA1968',
 'Arbitration Act': '/Act/AA2001',
 'Architects Act': '/Act/AA1991',
 'Arms and Explosives

In [6]:
def download_part(url, headers):
    r = requests.get(url, headers=headers)
    if r.status_code != requests.status_codes.codes.ok:
        print('URL not found: ' + url)
        return ''
    return r.text


def download_act(url, headers):
    parts = []
    r = requests.get(url, headers=headers)
    if r.status_code != requests.status_codes.codes.ok:
        print('URL not found: ' + url)
        return []
    parts.append(r.text)
    soup = BeautifulSoup(r.text)
    data = json.loads(soup('div', class_='global-vars')[1].get('data-json'))
    toc_sys_id = data['tocSysId']
    series_ids = [div.get('data-term') for div in soup('div', class_='dms')]
    for series_id in series_ids:
        frag_sys_id = data['fragments'][series_id]['Item1']
        dt_id = data['fragments'][series_id]['Item2']
        url = "https://sso.agc.gov.sg/Details/GetLazyLoadContent?TocSysId={}&SeriesId={}".format(toc_sys_id, series_id) + \
        "&ValidTime=&TransactionTime=&ViewType=&V=25&Phrase=&Exact=&Any=&Without=&WiAl=&WiPr=&WiLT=&WiSc=" + \
        "&WiDT=&WiDH=&WiES=&WiPH=&RefinePhrase=&RefineWithin=&CustomSearchId=&FragSysId={}&_={}".format(frag_sys_id, dt_id)
        parts.append(download_part(url, headers))
    return parts


def stitch_parts(parts):
    first, *remaining = parts
    insert_idx = first.find('<div class="dms"')
    return first[:insert_idx] + ''.join(remaining) + first[insert_idx:]


In [7]:
main_url = 'https://sso.agc.gov.sg'

In [8]:
url = main_url + acts['Accountants Act']
parts = download_act(url, headers)
content = stitch_parts(parts)

In [9]:
soup = BeautifulSoup(re.sub(r'<strong>.+?</strong>', '', re.sub(r'<em>(.+?)</em>', r'\1', content)))

In [10]:
subsecs = []
for prov1 in soup.find(id='legisContent')(class_='prov1'):
    if prov1.find(class_='def'):
        lines = [
            re.sub(
                r';( and| or)? \([a-z]{1,2}\)', r'\1', 
                defn.get_text(' ').split('[')[0].replace('\xa0', ' ').replace('— (a)', ''))
            for defn in prov1(class_='def')]
        subsecs += [' '.join(l.split()) for l in lines]
        continue
    
    lines = prov1.get_text('\n').split('\n')[1:]
    subsec = []
    for line in lines:
        if line.startswith('[') and line.endswith(']'):
            continue
        if line.startswith('(') and line.endswith(')') and len(line.split()) == 1:
            continue
        if '\xa0\xa0' in line:
            if subsec:
                subsecs.append(' '.join(subsec))
            line = line.split('\xa0\xa0')[-1].replace('\xa0', ' ').strip('— ')
            subsec = [' '.join(line.split())]
        else:
            subsec.append(' '.join(line.replace('\xa0', ' ').strip('— ').split()))        
subsecs

['“accounting corporation” means a company approved as an accounting corporation under section 17;',
 '“accounting firm” means a firm approved as an accounting firm under section 18;',
 '“accounting limited liability partnership” or “accounting LLP” means a limited liability partnership approved as an accounting limited liability partnership under section 18A;',
 '“alternate address” means an alternate address maintained with the Registrar under section 12C that meets the requirements of that section;',
 '“Authority” means the Accounting and Corporate Regulatory Authority established under the Accounting and Corporate Regulatory Authority Act (Cap. 2A);',
 '“Chairman” means the Chairman of the Oversight Committee;',
 '“company” has the same meaning as in the Companies Act (Cap. 50);',
 '“Complaints and Disciplinary Panel” means the Complaints and Disciplinary Panel appointed under section 39;',
 '“Complaints Committee” means a Complaints Committee constituted under Part VI;',
 '“corpor

In [12]:
nlp = spacy.load('en_core_web_sm')

In [50]:
doc = nlp(subsecs[22])
list(doc)

[“,
 Register,
 of,
 Public,
 Accounting,
 Corporations,
 ”,
 means,
 the,
 register,
 kept,
 and,
 maintained,
 under,
 section,
 5(1)(a)(ii,
 ),
 ;]

In [51]:
list(doc.sents)

[“Register of Public Accounting Corporations” means the register kept and maintained under section 5(1)(a)(ii);]

In [49]:
subsecs[22]

'“Register of Public Accounting Corporations” means the register kept and maintained under section 5(1)(a)(ii);'