In [1]:
import re
import os
import time
import shutil
import config
from word2number import w2n
from selenium import webdriver
from tqdm.notebook import tqdm
from concurrent.futures import TimeoutError                                  
from tqdm.notebook import tqdm
import pandas as pd
# for browser interaction & load waiting:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.common.action_chains import ActionChains

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [2]:
def listdir_nohidden(path):
    for f in os.listdir(path):
        if not f.startswith('.'):
            yield f

In [3]:
# returns target element once it is loaded
def wait_for(element, by, timeout=10, multi=False):
    global d
    time.sleep(.25)
    try:
        WebDriverWait(d, timeout).until(
            EC.presence_of_element_located((by, element)))
    except TimeoutException:
        print(f"Timeout limit reached ({timeout} s)")
        print(f"> Seaching for {element} by: {by}")
    finally:
        time.sleep(.25)
        if multi: return d.find_elements(by, element)
        else: return d.find_element(by, element)

In [4]:
def launch_new_search(verb=True):
    global d
    # open search panel
    url = 'https://scerisecm.boston.gov/ScerIS/CmPublic/#/Home'
    open_search = '/html/body/div[1]/div/div/div[2]/div[3]/search-dashboard/div/div/div[2]/div/div/div/div/ul/li[3]/dashboard-folders-renderer/div/div/div/a[3]'
    if d: d.quit()
   
    d = webdriver.Chrome()
    d.set_window_size(1000,800)
    d.set_window_position(0,0)

    d.get(url)
    time.sleep(.5)
    wait_for(open_search, By.XPATH).click()
    return d

In [5]:
def get_search_params(verb=True):
    # define search params and their corresponding input elements on the page
    params = wait_for("div[ng-repeat='dynamicTerm in dynamicSearchFieldTerms'",
                      By.CSS_SELECTOR, multi=True)
    names = [p.text.split('\n')[0] for p in params]
    input_containers = wait_for('input-with-help', By.CLASS_NAME, multi=True)
    inputs = [i.find_element_by_tag_name('input') for i in input_containers]
    
    # list available params
    search_param = {n: {'title':p,'input':i}  for n, p, i in zip(names, params, inputs)}
    
    return(search_param)

In [6]:
def set_search(param, keys, verb=True):
    global search_param
    param = search_param[param]
    if verb:
        d.execute_script("arguments[0].scrollIntoView();", param['title'])
    param['input'].send_keys(keys)

In [7]:
def download_file(adco_adrs, adrs, permit, n, n_items, year,
                  download_dir='../../Downloads', target_dir='permit_pdfs/'):
    time.sleep(.5)  

    def drop_ext(fn):
        return ''.join(fn.split('.')[:-1])

    # target file name 
    info = year+' - '+str(permit)+'-'+str(n)
    
    # if file already exists, cancel download
    
    pre_down = [drop_ext(p) for p in listdir_nohidden(target_dir+adco_adrs+'/'+adrs)]
    if any(info in p for p in pre_down):
        print(f'    |-[{n+1}/{n_items}] Already downloaded: {info} ')
        return
        
    
    print(f'    |-[{n+1}/{n_items}] Downloading {info}...')

    item = d.find_elements(By.CLASS_NAME, 'ui-grid-row')[n].click()
    wait_for("button[ng-click='openDocuments()']", By.CSS_SELECTOR).click()
    wait_for("button[data-pcc-toggle='dialog-download']", By.CSS_SELECTOR).click()
    wait_for("button[data-pcc-download='download']", By.CSS_SELECTOR).click()
    pages = int(d.find_element(By.CSS_SELECTOR, 'span[data-pcc-pagecount]').text)
    
    wait_for("pcc-overlay-download", By.CLASS_NAME).click()
    
    # if big document, download first ten pages only
    if pages > 10:
        print('      |-Large file - splicing first 10 pages...')
        n = str(n)+'_first10pg'
        wait_for("rdoRangeCustom", By.ID).click()
        wait_for("txtCustomRange", By.ID).send_keys('1-10')
    wait_for("OK", By.ID).click()
    
    
    # target current download in the downloads folder
    print(f"      |-Downloading '{filename}'...")
    filename = max([f for f in os.listdir(download_dir)],
                   key=lambda xa : os.path.getctime(os.path.join(download_dir,xa)))

    # monitor file for download completion
    waits = 0
    while '.part' in filename or 'crdownload' in filename: # wait while downloading
        filename = max([f for f in os.listdir(download_dir)],
                   key=lambda xa : os.path.getctime(os.path.join(download_dir,xa)))
                   # credit to dmb for targeting download file
                   # stackoverflow.com/questions/34548041
        waits +=1
        time.sleep(.33)
        print(f"      |-Downloading '{info}'...")
        if waits>30: # 10 seconds
            raise TimeoutError("error:|-Download timed out!")
    
    # rename new file
    newname = info+'.'+filename.split('.')[-1] # add original extension
    os.rename(os.path.join(download_dir, filename), os.path.join(download_dir, newname))
    print(f"      |-File renamed to '{filename}'...")

    # move file to this directory
    time.sleep(.2)
    shutil.move(os.path.join(download_dir, newname), target_dir+adco_adrs+'/'+adrs+'/'+newname)
    print(f"      |-File moved: '{newname}'")

    if o_len == os.listdir(target_dir+adco_adrs+'/'+adrs):
        raise TimeoutError('error:|-File not found after download!')
    print(f"      |-File saved: '{newname}'")
    
    wait_for("btnBackToSearchResult", By.ID).click()

In [8]:
def scrape_building_docs(row, target_dir='permit_pdfs/'):
    print('—'*30)
    adco_adrs = str(row['GPSaddress'])
    st_num = row['Primary Street Number']
    st_name = row['Street Name']
    adrs = f'{st_num}_{st_name}'
    permit = row['Permit Number']
    year = row['Permit Date'][-4:]
    
    # launch fresh search
    print('  |-Launching new search...')
    try: launch_new_search()
    except:
        if d: d.quit()
        d = False
        launch_new_search()
    
    # set search params
    global search_param
    print('  |-Defining search parameters...')
    search_param = get_search_params()
    set_search('Document Type', 'C/O')
    params = ['Primary Street Number', 'Street Name', 'Permit Number']
    for param in params:
        set_search(param, str(row[param]))
        time.sleep(.25)
    # get search results
    print('  |-Getting results...')
    time.sleep(.5)
    wait_for('btnSearch', By.ID).click()
    
    # create target download directory (to move file to)
    if not os.path.exists(target_dir+adco_adrs+'/'+adrs):
        os.makedirs(target_dir+adco_adrs+'/'+adrs)
        print('    |-Creating parent directory...')
    else: print('    |-Found parent directory...')

    # check pre-existing files
    o_len = len(os.listdir(target_dir+adco_adrs+'/'+adrs))
    print('    |-'+target_dir+adco_adrs+'/'+adrs+'/')
    print(f'      |-{o_len} existing files found')
    
    downloaded  = 0 # for logging 
    time.sleep(.5)
    time.sleep(1)
    items = wait_for('ui-grid-row', By.CLASS_NAME, multi=True)
    n_items = len(items)

    for n in range(n_items):
        download_file(adco_adrs, adrs, permit, n, n_items, year)
        downloaded +=1 
        
    print(f'  |-Done - {downloaded} files downloaded')
    print('—'*60)
        

---

# scrape

In [9]:
def launch_scraper():
    
    start = time.gmtime()
    globals()['to_search'] = pd.read_csv('sceris_adco_merged.csv')
    global to_search
    globals()['d'] = False
    global d
    max_rdx_tries = 3
    completed = 0
    
    to_search = to_search.drop_duplicates(
        subset=['Primary Street Number', 'Street Name', 'Permit Number'])
    
    
    deepest_idx = to_search[to_search['Permit Number']=='COO712195'].index[0]
    to_search = to_search.loc[deepest_idx+1:]
    
    for ridx in tqdm(to_search.index):
        row = to_search.loc[ridx]
        st_num = row['Primary Street Number']
        st_name = row['Street Name']
        adco_adrs = row['GPSaddress']
        permit = row['Permit Number']

        adrs = str(st_num)+'_'+st_name
        print('> Acquiring C/Os for', adrs.strip(), '-', permit.strip())
        wait_between = 2
        ridx_tries = 0
        success = False
        while True:
            ridx_tries+=1
            if ridx_tries > max_rdx_tries: break
            try:
                scrape_building_docs(row)
                success = True
                time.sleep(.2)
                break
            except:
                print(f'>>> Failed! Retrying... (Try {ridx_tries}/{max_rdx_tries})')
                d.quit()
                d = False
                time.sleep(wait_between)
                wait_between **= 2
        if not success:
            print('*'*5, 'ERROR: FAILED on ridx:', ridx)
            try: d.quit()
            except: pass
            print('ERROR DETAILS: > Building ID:', row['Building ID'])
        else:
            completed += 1
            
    print(f'\n>>> Completed downloading C/Os for{completed}/{len(to_search)} rows from target data.')
    print(f'\n  > Time elapsed: {time.strftime(time.gmtime()-start)}')
    

In [None]:
launch_scraper()

HBox(children=(FloatProgress(value=0.0, max=1542.0), HTML(value='')))

> Acquiring C/Os for 103_ARCH - 46020
——————————————————————————————
  |-Launching new search...
  |-Defining search parameters...
  |-Getting results...
    |-Found parent directory...
    |-permit_pdfs/103 Arch Street, Boston MA 02110/103_ARCH                                    /
      |-0 existing files found
    |-[1/1] Downloading 2002 - 46020-0...
>>> Failed! Retrying... (Try 1/3)
——————————————————————————————
  |-Launching new search...
  |-Defining search parameters...
  |-Getting results...
    |-Found parent directory...
    |-permit_pdfs/103 Arch Street, Boston MA 02110/103_ARCH                                    /
      |-0 existing files found
    |-[1/1] Downloading 2002 - 46020-0...
>>> Failed! Retrying... (Try 2/3)
——————————————————————————————
  |-Launching new search...
>>> Failed! Retrying... (Try 3/3)
***** ERROR: FAILED on ridx: 78
ERROR DETAILS: > Building ID: 0
> Acquiring C/Os for 103_ARCH - 632292
——————————————————————————————
  |-Launching new search...
>>>